Example #1
0
    def run(params, **kwargs):
        """Ingest per-year EB02 service files, then consolidate and load them.

        Runs a download → unzip → extract pipeline once per year (2000-2017
        inclusive), then a final consolidation + load pipeline that appends
        into the ``dim_shared_eb02`` table.
        """
        # Use context managers so the connection-config file handles are
        # closed as soon as the connectors have been fetched, instead of
        # leaking open file descriptors.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'aggregate_level': 'UInt8',
            'service_id': 'UInt16',
            'service': 'String',
        }

        download_data = DownloadStep(connector=source_connector)
        unzip_step = UnzipStep(pattern=r"\.csv$")
        extract_step = ExtractStep()
        consolidation_step = ConsolidationStep()
        load_step = LoadStep("dim_shared_eb02",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=['service_id'],
                             nullable_list=['aggregate_level'])

        for year in range(2000, 2017 + 1):
            # NOTE: mutates the caller's params dict in place.
            params['year'] = year

            pp = AdvancedPipelineExecutor(params)
            pp = pp.next(download_data).foreach(unzip_step).next(extract_step)
            pp.run_pipeline()

        # Final pass: consolidate everything extracted above and load it.
        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(consolidation_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Download, extract and append Russian monthly HS trade data into
        the ``trade_s_rus_m_hs`` table.
        """
        # Context managers close the connection-config handles promptly
        # instead of leaking open file descriptors.
        with open("etl/countries/russia/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Deep-copy the shared DTYPE mapping so the module-level constant is
        # not mutated by the override below.
        dtype = copy.deepcopy(DTYPE)
        dtype['hs6_id'] = 'String'

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep("trade_s_rus_m_hs",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=[
                                 'trade_flow_id', 'time_id', 'country_id',
                                 'region_id', 'district_id', 'hs10_id'
                             ],
                             nullable_list=['unit_short_name'])

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Download, unzip, extract and append Comtrade monthly HS trade data
        into the ``trade_i_comtrade_m_hs`` table.
        """
        # Close the connection-config file handles deterministically instead
        # of leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'time_id':                 'UInt32',
            'trade_flow_id':           'UInt8',
            'reporter_id':             'UInt32',
            'partner_id':              'UInt32',
            'qty_unit_id':             'String',
            'qty_unit':                'String',
            'qty':                     'Float64',
            'netweight_kg':            'Float64',
            'trade_value_us_dollars':  'UInt64',
            'hs6_id':                  'UInt32'
        }

        download_data = DownloadStep(connector=source_connector)
        unzip_step = UnzipStep(pattern=r"\.csv$")
        extract_step = ExtractStep()
        load_step = LoadStep(
            "trade_i_comtrade_m_hs", db_connector, if_exists="append", dtype=dtype,
            pk=['reporter_id', 'trade_flow_id', 'time_id'],
            nullable_list=['qty', 'trade_value_us_dollars']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).foreach(unzip_step).next(extract_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Download, extract and append the shared countries dimension into
        the ``dim_shared_countries`` table.
        """
        # Context managers ensure the config file handles are closed rather
        # than leaked.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'id': 'String',
            'id_num': 'UInt32',
            'iso3': 'String',
            'iso2': 'String',
            'continent': 'String',
            'color': 'String',
            'name': 'String',
        }

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep("dim_shared_countries",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=['id_num'])

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Download, unzip, extract and append BACI annual trade data into
        the ``trade_i_baci_a_<hs_code>`` table selected by ``params['hs_code']``.
        """
        # Close the config handles deterministically instead of leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'year': 'UInt32',
            'hs6_id': 'UInt32',
            'exporter': 'UInt32',
            'importer': 'UInt32',
            'trade_value_thou_us_dollars': 'Float64',
            'trade_value_us_dollars': 'Float64',
            'qty_tons': 'Float64',
        }

        download_data = DownloadStep(connector=source_connector)
        unzip_step = UnzipStep(pattern=r"\.csv$")
        extract_step = ExtractStep()

        load_step = LoadStep(
            "trade_i_baci_a_{}".format(params['hs_code']), db_connector, if_exists="append", dtype=dtype,
            pk=['exporter', 'importer', 'year'], nullable_list=['qty_tons']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).foreach(unzip_step).next(extract_step).next(load_step)

        return pp.run_pipeline()
Example #6
0
    def run(params, **kwargs):
        """Download, extract and append the HS product hierarchy into the
        ``dim_shared_hs<hs_code>`` table selected by ``params['hs_code']``.
        """
        # Context managers close the config file handles instead of
        # leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table: chapter/hs2/hs4/hs6 levels,
        # each with a name and a numeric id.
        dtype = {
            'chapter':         'String',
            'chapter_name':    'String',
            'chapter_id':      'UInt8',
            'hs2':             'String',
            'hs2_name':        'String',
            'hs2_id':          'UInt8',
            'hs4':             'String',
            'hs4_name':        'String',
            'hs4_id':          'UInt16',
            'hs6':             'String',
            'hs6_name':        'String',
            'hs6_id':          'UInt32',
        }

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep(
            "dim_shared_hs{}".format(params['hs_code']), db_connector,
            if_exists="append", dtype=dtype,
            pk=['hs6_id']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
Example #7
0
    def run(params, **kwargs):
        """Download, unzip, extract and append Comtrade annual EB02 service
        trade data into the ``services_i_comtrade_a_eb02`` table.
        """
        # Close the config handles deterministically instead of leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'year': 'UInt16',
            'trade_flow_id': 'UInt8',
            'reporter_id': 'UInt32',
            'partner_id': 'UInt32',
            'service_id': 'UInt16',
            'trade_value_us_dollars': 'Int64',
        }

        download_data = DownloadStep(connector=source_connector)
        unzip_step = UnzipStep(pattern=r"\.csv$")
        extract_step = ExtractStep()
        load_step = LoadStep("services_i_comtrade_a_eb02",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=['reporter_id', 'trade_flow_id', 'year'],
                             nullable_list=['trade_value_us_dollars'])

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).foreach(unzip_step).next(
            extract_step).next(load_step)

        return pp.run_pipeline()
Example #8
0
    def run(params, **kwargs):
        """Download, extract and append World Bank country metadata into the
        ``dim_shared_geo`` table.
        """
        # Context managers close the config file handles instead of
        # leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table. Order matters: the first three
        # keys are treated as non-nullable below.
        dtype = {
            'geo_id': 'String',
            'short_name': 'String',
            'table_name': 'String',
            'long_name': 'String',
            'two_alpha_code': 'String',
            'two_alpha_code_lower': 'String',
            'currency_unit': 'String',
            'special_notes': 'String',
            'region': 'String',
            'income_group': 'String',
            'wb_2_code': 'String',
            'national_accounts_base_year': 'String',
            'national_accounts_reference_year': 'String',
            'sna_price_valuation': 'String',
            'lending_category': 'String',
            'other_groups': 'String',
            'system_of_national_accounts': 'String',
            'alternative_conversion_factor': 'String',
            'ppp_survey_year': 'String',
            'balance_of_payments_manual_in_use': 'String',
            'external_debt_reporting_status': 'String',
            'system_of_trade': 'String',
            'government_accounting_concept': 'String',
            'imf_data_dissemination_standard': 'String',
            'latest_population_census': 'String',
            'latest_household_survey': 'String',
            'income_and_expenditure_source': 'String',
            'vital_registration_complete': 'String',
            'latest_agricultural_census': 'String',
            'latest_industrial_data': 'UInt32',
            'latest_trade_data': 'UInt32',
        }

        # Every column after the first three may contain nulls.
        nullable_list = list(dtype.keys())[3:]

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep("dim_shared_geo",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=['geo_id'],
                             nullable_list=nullable_list)

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
Example #9
0
    def run(params, **kwargs):
        """Download, transform and append the Brazil NCM table to the local
        Postgres database.
        """
        src = grab_connector(__file__, params.get("source-connector"))
        postgres = grab_connector(__file__, "postgres-local")

        # Assemble the three pipeline stages, then wire them in order.
        stages = [
            DownloadStep(connector=src),
            TransformStep(),
            LoadStep("brazil_ncm", postgres, if_exists="append"),
        ]

        executor = AdvancedPipelineExecutor(params)
        for stage in stages:
            executor = executor.next(stage)

        return executor.run_pipeline()
Example #10
0
    def run(params, **kwargs):
        """Download a dimension file and replace the ``<dim>_table`` table
        named by ``params['dim']`` in the local Postgres database.
        """
        src = grab_connector(__file__, params.get("source-connector"))
        postgres = grab_connector(__file__, "postgres-local")

        download = DownloadStep(connector=src)
        load = LoadStep(params.get("dim") + "_table",
                        postgres,
                        if_exists="replace")

        executor = AdvancedPipelineExecutor(params).next(download).next(load)
        return executor.run_pipeline()
Example #11
0
    def run(params, **kwargs):
        """Download, extract and append the Russia countries dimension into
        the ``dim_rus_countries`` table.
        """
        # Close the config handles deterministically instead of leaking them.
        with open("etl/countries/russia/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep(
            "dim_rus_countries", db_connector, if_exists="append", dtype=DTYPE, pk=['id'], nullable_list=['name']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
Example #12
0
    def run(params, **kwargs):
        """Download, transform and append MDIC monthly HS trade data into
        the ``trade_i_mdic_m_hs`` table.
        """
        src = grab_connector(__file__, params.get("source"))
        db = grab_connector(__file__, params.get("db"))

        # Build the stages up front, then chain them in order.
        stages = [
            DownloadStep(connector=src),
            TransformStep(),
            LoadStep("trade_i_mdic_m_hs",
                     db,
                     if_exists="append",
                     pk=["time_id"]),
        ]

        executor = AdvancedPipelineExecutor(params)
        for stage in stages:
            executor = executor.next(stage)

        return executor.run_pipeline()
    def run(params, **kwargs):
        """Download, extract and append Sweden monthly HS trade data into
        the ``trade_s_swe_m_hs`` table.
        """
        # Close the config handles deterministically instead of leaking them.
        with open("etl/countries/sweden/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep(
            "trade_s_swe_m_hs", db_connector, if_exists="append", dtype=DTYPE,
            pk=['hs6_id', 'trade_flow_id', 'partner_iso3'],
            nullable_list=['amount']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
Example #14
0
    def run(params, **kwargs):
        """Extract and append the trade-flow dimension into the
        ``dim_shared_trade_flow`` table (no download step needed).
        """
        # Context manager closes the config handle instead of leaking it.
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'trade_flow_id':       'UInt8',
            'trade_flow_name':     'String',
        }

        extract_step = ExtractStep()
        load_step = LoadStep(
            "dim_shared_trade_flow", db_connector,
            if_exists="append", dtype=dtype, pk=['trade_flow_id']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(extract_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Download, extract and append World Bank indicator metadata into
        the ``dim_shared_indicators`` table.
        """
        # Context managers close the config file handles instead of
        # leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"),
                                               conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table. Order matters: the first two
        # keys are treated as non-nullable below.
        dtype = {
            'indicator_id': 'String',
            'topic': 'String',
            'indicator_name': 'String',
            'short_definition': 'String',
            'long_definition': 'String',
            'unit_of_measure': 'String',
            'periodicity': 'String',
            'base_period': 'String',
            'other_notes': 'String',
            'aggregation_method': 'String',
            'limitations_and_expectations': 'String',
            'notes_from_original_source': 'String',
            'general_comments': 'String',
            'source': 'String',
            'statistical_concept_and_methodology': 'String',
            'development_relevance': 'String',
            'related_source_links': 'String',
            'other_web_links': 'String',
            'related_indicators': 'String',
            'license_type': 'String'
        }

        # Every column after the first two may contain nulls.
        nullable_list = list(dtype.keys())[2:]

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep("dim_shared_indicators",
                             db_connector,
                             if_exists="append",
                             dtype=dtype,
                             pk=['indicator_id'],
                             nullable_list=nullable_list)

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()
    def run(params, **kwargs):
        """Create and replace six shared dimension tables (time, products,
        countries, states, municipalities, flow), each via its own creation
        step followed by a replace-load.
        """
        db = grab_connector(__file__, params.get("db"))

        # (creation step, target table, primary-key column) for each
        # dimension, in the order they are executed.
        dimensions = [
            (TimeCreationStep(), "dim_shared_time", "time_id"),
            (ProductCreationStep(), "dim_shared_products", "hs4_id"),
            (CountryCreationStep(), "dim_shared_countries", "id"),
            (StateCreationStep(), "dim_shared_states", "state_id"),
            (MunicipalityCreationStep(), "dim_shared_municipalities",
             "municipality_id"),
            (FlowCreationStep(), "dim_shared_flow", "flow_id"),
        ]

        executor = AdvancedPipelineExecutor(params)
        for creation_step, table, pk_column in dimensions:
            loader = LoadStep(table,
                              db,
                              if_exists="replace",
                              pk=[pk_column])
            executor = executor.next(creation_step).next(loader)

        return executor.run_pipeline()
Example #17
0
    def run(params, **kwargs):
        """Download, extract and append WDI annual indicator values into the
        ``indicators_i_wdi_a`` table.
        """
        # Close the config handles deterministically instead of leaking them.
        with open("etl/conns.yaml") as conns:
            source_connector = Connector.fetch(params.get("source_connector"), conns)
        with open("etl/conns.yaml") as conns:
            db_connector = Connector.fetch(params.get("db_connector"), conns)

        # Column types for the target table.
        dtype = {
            'geo_id':                  'String',
            'indicator_id':            'String',
            'year':                    'UInt32',
            'mea':                     'Float64'
        }

        download_data = DownloadStep(connector=source_connector)
        extract_step = ExtractStep()
        load_step = LoadStep(
            "indicators_i_wdi_a", db_connector, if_exists="append", dtype=dtype,
            pk=['geo_id'], nullable_list=['mea']
        )

        pp = AdvancedPipelineExecutor(params)
        pp = pp.next(download_data).next(extract_step).next(load_step)

        return pp.run_pipeline()