def run(params, **kwargs): source_connector = grab_connector(__file__, params.get("source-connector")) postgres_connector = grab_connector(__file__, "postgres-local") step1 = DownloadStep(connector=source_connector) step2 = TransformStep() step3 = LoadStep("brazil_ncm", postgres_connector, if_exists="append") pipeline = AdvancedPipelineExecutor(params) pipeline = pipeline.next(step1).next(step2).next(step3) return pipeline.run_pipeline()
def run(params, **kwargs): source_connector = grab_connector(__file__, params.get("source-connector")) postgres_connector = grab_connector(__file__, "postgres-local") step1 = DownloadStep(connector=source_connector) step2 = LoadStep(params.get("dim") + "_table", postgres_connector, if_exists="replace") pipeline = AdvancedPipelineExecutor(params) pipeline = pipeline.next(step1).next(step2) return pipeline.run_pipeline()
def run(params, **kwargs): source_connector = grab_connector(__file__, params.get("source")) db_connector = grab_connector(__file__, params.get("db")) step1 = DownloadStep(connector=source_connector) step2 = TransformStep() step3 = LoadStep("trade_i_mdic_m_hs", db_connector, if_exists="append", pk=["time_id"]) pipeline = AdvancedPipelineExecutor(params) pipeline = pipeline.next(step1).next(step2).next(step3) return pipeline.run_pipeline()
def steps():
    source = grab_connector(__file__, "http-local")
    db = grab_connector(__file__, "clickhouse-local")

    step1 = DownloadStep(connector=source, connector_path=__file__)
    step2 = TransformationStep()

    dtypes = {
        "int_column": "UInt32",
        "str_column": "String"
    }
    step3 = LoadStep("table_name", connector=db, pk=["primary_key"], dtype=dtypes,
                     nullable_list=["null_col"])

    return [step1, step2, step3]
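# The custom steps referenced in these pipelines (TransformationStep, TransformStep,
# OpenStep, and so on) are defined in each pipeline's own module and are not shown here.
# As a rough illustration only: a minimal transform step, assuming bamboo_lib's
# PipelineStep base class and its run_step(prev_result, params) hook. The class name,
# file handling, and column names below are placeholders, not taken from any pipeline above.
import pandas as pd
from bamboo_lib.models import PipelineStep


class ExampleTransformStep(PipelineStep):
    def run_step(self, prev_result, params):
        # prev_result is whatever the previous step returned -- typically the local
        # file path produced by a DownloadStep.
        df = pd.read_csv(prev_result)
        # Keep only the columns the following LoadStep expects (placeholder names).
        df = df[["primary_key", "int_column", "str_column", "null_col"]]
        # The return value is handed to the next step in the pipeline.
        return df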
def steps(params): db_connector = grab_connector(__file__, params.get("output-db")) open_step = OpenStep() tidy_step = TidyStep() region_step = RegionDimensionStep() load_region = LoadStep(table_name="tic_dim_region", connector=db_connector, if_exists="drop", pk=["region_id"], dtype={ "region_id": "UInt8", "region_name": "String" }, nullable_list=[]) variable_step = VariableDimensionStep() load_variable = LoadStep(table_name="tic_dim_variable", connector=db_connector, if_exists="drop", pk=["response_id"], dtype={ "response_id": "UInt8", "variable_name": "String", "response_name": "String", "combined": "String" }, nullable_list=[]) fact_step = FactTableStep() load_fact = LoadStep(table_name="tic_fact", connector=db_connector, if_exists="drop", pk=["region_id"], dtype={ "region_id": "UInt8", "data_origin_id": "UInt8", "response_id": "UInt8", "year": "UInt8", "percentage": "Float64" }, nullable_list=[]) if params.get("ingest") == True: steps = [ open_step, tidy_step, region_step, load_region, variable_step, load_variable, fact_step, load_fact ] else: steps = [ open_step, tidy_step, region_step, variable_step, fact_step ] return steps
def steps(params):
    connector = grab_connector('etl/budget_transparency/conns.yaml', 'budget-transparency-wildcard')
    db_connector = Connector.fetch("clickhouse-database", open("etl/conns.yaml"))

    download_step = WildcardDownloadStep(connector=connector)
    read_step = ReadStep(save_result_key="global_df")

    # Dimension and fact steps
    dim_fun_step = DimFunStep(save_result_key="dim_fun")
    dim_dep_step = DimDepStep(save_result_key="dim_dep")
    dim_exp_step = DimExpTypeStep(save_result_key="dim_exp")
    dim_cap_step = DimCapStep(save_result_key="dim_cap")
    fact_step = FactStep()

    # Load steps; the dt_* dtype dicts are defined elsewhere (module level or imported)
    ld_fun = LoadStep('dim_fun_budget_transparency', db_connector, if_exists="drop",
                      pk=['subfunction_id'], dtype=dt_fun)
    ld_dep = LoadStep('dim_dep_budget_transparency', db_connector, if_exists="drop",
                      pk=['department_id'], dtype=dt_dep)
    ld_exp = LoadStep('dim_exp_budget_transparency', db_connector, if_exists="drop",
                      pk=['exp_type_id'], dtype=dt_exp)
    ld_cap = LoadStep('dim_cap_budget_transparency', db_connector, if_exists="drop",
                      pk=['concept_id'], dtype=dt_cap)
    ld_fact = LoadStep('budget_transparency', db_connector, if_exists="drop",
                       pk=['quarter_id'], dtype=dt_fact)

    return [
        download_step, read_step,
        dim_fun_step, ld_fun,
        dim_dep_step, ld_dep,
        dim_exp_step, ld_exp,
        dim_cap_step, ld_cap,
        fact_step, ld_fact
    ]
def steps(params): db_connector = grab_connector(__file__, params.get("output-db")) open_step = OpenStep() transform_step = TransformStep() load_step = LoadStep(table_name="wakanda_trade", connector=db_connector, if_exists="append", pk=["time_id"]) return [open_step, transform_step, load_step]
def steps(params): source_connector = grab_connector(__file__, params.get("input-file")) db_connector = grab_connector(__file__, params.get("output-db")) country_step = CountryDimStep() load_country = LoadStep(table_name="dim_country", connector=db_connector, if_exists="drop", pk=["country_id"], dtype={ "country_id": "Int64", "country_name": "String", "country_code": "String" }, nullable_list=[]) download_step = DownloadStep(connector=source_connector, force=False) transform_step = TransformStep() load_step = LoadStep(table_name="gdp_fact", connector=db_connector, if_exists="drop", pk=["year"], dtype={ "year": "Int64", "country_id": "Int64", "gdp_value": "Float64" }, nullable_list=[]) steps = [ country_step, load_country, download_step, transform_step, load_step ] if params.get("ingest") else [ country_step, download_step, transform_step ] return steps
def steps(params): db_connector = grab_connector(__file__, params.get("db")) open_step = OpenStep() transform_step = TransformStep() load_step = LoadStep(table_name="unemployment_claims", connector=db_connector, if_exists="append", pk=["week_ended"]) steps = [open_step, transform_step, load_step ] if params.get("ingest") else [open_step, transform_step] return steps
def steps(params):
    connector = grab_connector('etl/insurers/conns.yaml', 'insurers')
    db_connector = Connector.fetch("clickhouse-database", open("etl/conns.yaml"))

    dl_step = DownloadStep(connector=connector)
    read_step = ReadStep(save_result_key="global_df")

    # Medical-supply dimension
    dim_med_supply_step = DimMedicalSupply(save_result_key="dim_medical_supply")
    ld_dim_step = LoadStep('dim_insurers_med_supply', db_connector, if_exists="drop",
                           pk=['supply_id'], dtype=dt_dim_med_supply)

    # Fact table
    x_step = XformStep()
    ld_fact_step = LoadStep('insurers', db_connector, if_exists="drop",
                            pk=['patient_id', 'ingress_date', 'egress_date',
                                'supply_id', 'hospital_group', 'diagnosis'],
                            dtype=dt_insurers_fact)

    return [dl_step, read_step, dim_med_supply_step, ld_dim_step, x_step, ld_fact_step]
def run(params, **kwargs): db_connector = grab_connector(__file__, params.get("db")) step1 = TimeCreationStep() step2 = LoadStep("dim_shared_time", db_connector, if_exists="replace", pk=["time_id"]) step3 = ProductCreationStep() step4 = LoadStep("dim_shared_products", db_connector, if_exists="replace", pk=["hs4_id"]) step5 = CountryCreationStep() step6 = LoadStep("dim_shared_countries", db_connector, if_exists="replace", pk=["id"]) step7 = StateCreationStep() step8 = LoadStep("dim_shared_states", db_connector, if_exists="replace", pk=["state_id"]) step9 = MunicipalityCreationStep() step10 = LoadStep("dim_shared_municipalities", db_connector, if_exists="replace", pk=["municipality_id"]) step11 = FlowCreationStep() step12 = LoadStep("dim_shared_flow", db_connector, if_exists="replace", pk=["flow_id"]) pipeline = AdvancedPipelineExecutor(params) pipeline = pipeline.next(step1).next(step2).next(step3).next( step4).next(step5).next(step6).next(step7).next(step8).next( step9).next(step10).next(step11).next(step12) return pipeline.run_pipeline()
def steps(params): source_connector = grab_connector(__file__, "resources-data") db_connector = Connector.fetch("clickhouse-database", open("../conns.yaml")) dtype = { "mun_id": "UInt16", "clues_id": "String", "type_id": "String", "year": "UInt16", "resources_subcategories_id": "UInt16", "total": "UInt32" } download_step = DownloadStep( connector=source_connector, force=True ) transform_step = TransformStep() load_step = LoadStep("health_resources", db_connector, if_exists="append", pk=["mun_id"], dtype=dtype) return [download_step, transform_step, load_step]
def steps(params): source_connector = grab_connector("../conns.yaml", "fincen-source") #db_connector = grab_connector("../conns.yaml", params.get("output-db")) dl_step = DownloadStep(connector=source_connector) unzip_step = UnzipStep(pattern=r"\.csv$") date_step = DateStep() #load_fact = LoadStep( # table_name="tic_fact", # connector=db_connector, # if_exists="drop", # pk=["region_id"], # dtype={"region_id":"UInt8","data_origin_id":"UInt8","response_id":"UInt8","year":"UInt8","percentage":"Float64"}, # nullable_list=[] #) if params.get("ingest") == True: steps = [] #[open_step, tidy_step, region_step, load_region, variable_step, load_variable, fact_step, load_fact] else: steps = [dl_step, unzip_step, date_step] return steps
import pandas as pd
from tabulate import tabulate

from bamboo_lib.helpers import grab_connector, query_to_df

db_connector = grab_connector(__file__, "clickhouse-remote")

# Get table names
names_df = query_to_df(db_connector, "SHOW TABLES")
tables = names_df["name"].tolist()

# Print row counts per year for every ACS table
for t in [table for table in tables if "acs" in table]:
    print("\nTABLE: {}".format(t))
    query = "SELECT year, COUNT(*) FROM {} GROUP BY year ORDER BY year;".format(t)
    df = query_to_df(db_connector, query, col_headers=["year", "count"])
    new_df = pd.DataFrame({
        "year": df["year"].tolist(),
        "count": ["{:,}".format(int(c)) for c in df["count"]]
    })
    print(tabulate(new_df, headers="keys", tablefmt="psql", showindex=False))