Beispiel #1
0
def main():
    """Run the ETL loop over the module-level ``inputdata`` source.

    Each row is enriched in place by ``extractdomaininfo`` and
    ``extractserverinfo``, its ``size`` value is converted with
    ``pygrametl.getint``, the keys returned by the page, date and test
    dimensions are stored on the row, and the row is inserted into the
    fact table. The module-level ``connection`` is committed once after
    all rows have been processed.
    """
    for record in inputdata:
        extractdomaininfo(record)
        extractserverinfo(record)

        # Convert the size value to an int
        raw_size = record["size"]
        record["size"] = pygrametl.getint(raw_size)

        # Resolve the dimension keys one by one, then add the fact row
        page_key = pagesf.scdensure(record)
        record["pageid"] = page_key

        date_key = datedim.ensure(record, {"date": "downloaddate"})
        record["dateid"] = date_key

        test_key = testdim.lookup(record, {"testname": "test"})
        record["testid"] = test_key

        facttbl.insert(record)

    connection.commit()
def main():
    """Process every row of the module-level ``inputdata`` and load it.

    For each row: run ``extractdomaininfo`` and ``extractserverinfo`` on
    it, convert ``size`` via ``pygrametl.getint``, store the keys returned
    by ``pagesf``, ``datedim`` and ``testdim`` on the row, and insert the
    row into ``facttbl``. A single commit on the module-level
    ``connection`` follows the loop.
    """
    for entry in inputdata:
        extractdomaininfo(entry)
        extractserverinfo(entry)

        # Convert the size value to an int
        size_value = pygrametl.getint(entry['size'])
        entry['size'] = size_value

        # Add the data to the dimension tables ...
        page_id = pagesf.scdensure(entry)
        entry['pageid'] = page_id
        date_id = datedim.ensure(entry, {'date': 'downloaddate'})
        entry['dateid'] = date_id
        test_id = testdim.lookup(entry, {'testname': 'test'})
        entry['testid'] = test_id

        # ... and to the fact table
        facttbl.insert(entry)

    connection.commit()
def main():
    """Load all rows from the module-level ``inputdata`` into the warehouse.

    Every row is passed through ``extractdomaininfo`` and
    ``extractserverinfo``, gets its ``size`` converted with
    ``pygrametl.getint``, receives ``pageid``, ``dateid`` and ``testid``
    from the respective dimension objects, and is then inserted into the
    fact table. The module-level ``connection`` is committed after the
    last row.
    """
    for item in inputdata:
        extractdomaininfo(item)
        extractserverinfo(item)

        # Convert the size value to an int
        original_size = item['size']
        item['size'] = pygrametl.getint(original_size)

        # Dimension tables first, so the fact row carries all three keys
        pageid = pagesf.scdensure(item)
        item['pageid'] = pageid

        dateid = datedim.ensure(item, {'date': 'downloaddate'})
        item['dateid'] = dateid

        testid = testdim.lookup(item, {'testname': 'test'})
        item['testid'] = testid

        facttbl.insert(item)

    connection.commit()
def load_str_dot(file, config):
    """Load one bz2-compressed WOB ZZ DOT subtraject file.

    Main ETL method for WOB ZZ subtrajecten. The file is read as
    ';'-separated CSV without a header row (column names are supplied
    here), every row is cleaned in place, its dimension keys are looked up
    or created, and the row is inserted into ``FCT_SUBTRAJECT``. One commit
    is issued on the global ``connection`` after all rows are processed.

    Requires an active pygrametl connection as the global ``connection``
    and the module-level ``DIM_*`` / ``FCT_SUBTRAJECT`` table objects.

    Args:
        file: Name of the source file, relative to the configured
            ``data_path``.
        config: Object exposing ``get(section, option)``; the ``wob_zz``
            section must provide ``data_path`` and ``staging_path``
            (presumably a ``configparser`` instance -- confirm at caller).
    """
    global connection
    paths = {'data_path': config.get('wob_zz', 'data_path'),
             'staging_path': config.get('wob_zz', 'staging_path')}
    names_STR = ['datum_aanmaak', 'landcode', 'geslacht',
                 'verwijzend_specialisme', 'zorgtrajectnummer',
                 'zorgtrajectnummer_parent', 'begindatum_zorgtraject',
                 'einddatum_zorgtraject', 'declaratiedatasetnummer',
                 'subtrajectnummer', 'subtraject_id', 'declaratiecode',
                 'behandelend_specialisme', 'zorgtypecode', 'zorgvraagcode',
                 'typerende_diagnose', 'icd10_vertaling_diagnose',
                 'hoofdtraject_indicatie', 'zorgproductcode',
                 'dbc_reden_sluiten', 'aanspraak_zvw',
                 'aanspraak_zvw_toegepast', 'zorgact_met_machtiging',
                 'oranje_zorgactiviteit', 'zorgactiviteitvertaling_toegepast',
                 'begindatum_subtraject', 'einddatum_subtraject',
                 'declaratiedatum', 'dbc_ziekenhuiskosten',
                 'honorarium_totaal']

    # Maps warehouse attribute names (keys) to source column names (values);
    # passed to every ensure()/insert() call that needs renaming.
    name_mapping = {
        'afl_afsluitreden_code'         : 'dbc_reden_sluiten',
        #'beh_dbc_specialisme_code'      : 'behandelend_specialisme',
        #'beh_dbc_behandeling_code'      : 'behandelcode',
        'dcl_declaratie_code'           : 'declaratiecode',
        'dia_dbc_specialisme_code'      : 'behandelend_specialisme',
        'dia_dbc_diagnose_code'         : 'typerende_diagnose',
        'geslacht'                      : 'geslacht',
        'heeft_zorgactiviteit_met_machtiging': 'zorgact_met_machtiging',
        'heeft_oranje_zorgactiviteit'   : 'oranje_zorgactiviteit',
        'is_aanspraak_zvw'              : 'aanspraak_zvw',
        'is_aanspraak_zvw_toegepast'    : 'aanspraak_zvw_toegepast',
        'is_hoofdtraject'               : 'hoofdtraject_indicatie',
        'is_zorgactiviteitvertaling_toegepast': 'zorgactiviteitvertaling_toegepast',
        'lnd_land_code'                 : 'landcode',
        'stn_subtraject_id'             : 'subtraject_id',
        'stn_subtrajectnummer'          : 'subtrajectnummer',
        'stn_zorgtrajectnummer'         : 'zorgtrajectnummer',
        'stn_zorgtrajectnummer_parent'  : 'zorgtrajectnummer_parent',
        'zgt_dbc_specialisme_code'      : 'behandelend_specialisme',
        'zgt_dbc_zorgtype_code'         : 'zorgtypecode',
        'zgv_dbc_specialisme_code'      : 'behandelend_specialisme',
        'zgv_dbc_zorgvraag_code'        : 'zorgvraagcode',
        'zpr_dbc_zorgproduct_code'      : 'zorgproductcode',
        'fct_omzet_ziekenhuis'          : 'dbc_ziekenhuiskosten',
        'fct_omzet_honorarium_totaal'   : 'honorarium_totaal',

        }

    # The context manager guarantees the compressed file handle is closed,
    # also when a row fails half-way through the load.
    # NOTE(review): no explicit encoding is passed here, so the platform
    # default is used -- confirm this is intended for these files.
    with bz2.open(paths['data_path'] + '/' + file, mode='rt') as source_file:
        source = csv.DictReader(source_file, delimiter=';',
                                quotechar='"', fieldnames=names_STR)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.
              format(time.strftime('%H:%M:%S', starttime), file))

        for row in source:

            # convert date columns to appropriate format
            row['begindatum_zorgtraject'] = parse_dates(row['begindatum_zorgtraject'])
            row['einddatum_zorgtraject'] = parse_dates(row['einddatum_zorgtraject'])
            row['begindatum_subtraject'] = parse_dates(row['begindatum_subtraject'])
            row['einddatum_subtraject'] = parse_dates(row['einddatum_subtraject'])
            row['declaratiedatum'] = parse_dates(row['declaratiedatum'])

            # ensure DBC codes are filled to right length
            row['verwijzend_specialisme'] = \
                parse_codes(row['verwijzend_specialisme'], 4, '_?_')
            row['behandelend_specialisme'] = \
                parse_codes(row['behandelend_specialisme'], 4, '_?_')
            row['zorgtypecode'] = parse_codes(row['zorgtypecode'], 2, '??')
            row['zorgvraagcode'] = parse_codes(row['zorgvraagcode'], 4, '_?_')
            row['typerende_diagnose'] = parse_codes(row['typerende_diagnose'], 4, '_?_')
            row['zorgproductcode'] = parse_codes(row['zorgproductcode'], 9, '_?_')

            # convert gender (geslacht) into int conforming to COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # convert booleans
            row['hoofdtraject_indicatie'] = parse_boolean(row['hoofdtraject_indicatie'])
            row['aanspraak_zvw'] = parse_boolean(row['aanspraak_zvw'])
            row['aanspraak_zvw_toegepast'] = parse_boolean(row['aanspraak_zvw_toegepast'])
            row['oranje_zorgactiviteit'] = parse_boolean(row['oranje_zorgactiviteit'])
            row['zorgact_met_machtiging'] = parse_boolean(row['zorgact_met_machtiging'])
            row['zorgactiviteitvertaling_toegepast'] = parse_boolean(row['zorgactiviteitvertaling_toegepast'])

            # convert money values into decimals
            row['dbc_ziekenhuiskosten'] = parse_money(row['dbc_ziekenhuiskosten'])
            row['honorarium_totaal'] = parse_money(row['honorarium_totaal'])

            # derive dimension ids
            row['beh_id'] = -1 # no treatment codes (behandelcodes) in DOT per 2012-01-01
            row['dag_id_begindatum_zorgtraject'] = DIM_DAG.lookup(row, {'dag_datum': 'begindatum_zorgtraject'})
            row['dag_id_einddatum_zorgtraject'] = DIM_DAG.lookup(row, {'dag_datum': 'einddatum_zorgtraject'})
            row['dag_id_begindatum_subtraject'] = DIM_DAG.lookup(row, {'dag_datum': 'begindatum_subtraject'})
            row['dag_id_einddatum_subtraject'] = DIM_DAG.lookup(row, {'dag_datum': 'einddatum_subtraject'})
            row['dag_id_declaratiedatum'] = DIM_DAG.lookup(row, {'dag_datum': 'declaratiedatum'})
            row['dia_id'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['zgv_id'] = DIM_ZORGVRAAG.ensure(row, name_mapping)
            row['zpr_id'] = DIM_ZORGPRODUCT.ensure(row, name_mapping)
            # The same dimension serves both the treating and the referring
            # specialism; only the source column differs.
            row['zvs_id_behandelend'] = \
                DIM_ZORGVERLENERSOORT.ensure(row, {'zvs_vektis_zorgverlenersoort_code': 'behandelend_specialisme'})
            row['zvs_id_verwijzend'] = \
                DIM_ZORGVERLENERSOORT.ensure(row, {'zvs_vektis_zorgverlenersoort_code': 'verwijzend_specialisme'})

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.
          format(time.strftime('%H:%M:%S', endtime), file))
    print('           Processing time: %0.2f seconds ' % (end_s - start_s))
Beispiel #5
0
def convertsize(row):
    """Replace ``row['size']`` in place with its ``pygrametl.getint`` result."""
    size_as_int = pygrametl.getint(row['size'])
    row['size'] = size_as_int
def load_subtraject(file, config):
    """Load one bz2-compressed WOB GGZ subtraject file.

    Main ETL method for WOB GGZ subtrajecten. The file is read as
    cp1252-encoded, ';'-separated CSV without a header row (column names
    are supplied here), every row is cleaned in place, its dimension keys
    are looked up or created, and the row is inserted into
    ``FCT_SUBTRAJECT``. One commit is issued on the global ``connection``
    after all rows are processed.

    Requires an active pygrametl connection as the global ``connection``
    and the module-level ``DIM_*`` / ``FCT_SUBTRAJECT`` table objects.

    Args:
        file: Name of the source file, relative to the configured
            ``data_path``.
        config: Object exposing ``get(section, option)``; the ``wob_ggz``
            section must provide ``data_path`` and ``staging_path``
            (presumably a ``configparser`` instance -- confirm at caller).
    """
    global connection
    paths = {
        'data_path': config.get('wob_ggz', 'data_path'),
        'staging_path': config.get('wob_ggz', 'staging_path')
    }
    column_names = [
        'datum_aanmaak', 'geslacht', 'landcode', 'zorgtrajectnummer',
        'begindatum_zorgtraject', 'einddatum_zorgtraject',
        'primaire_diagnose_code', 'primaire_diagnose_trekken_van',
        'primaire_diagnose_datum', 'dbc_trajectnummer', 'zorgtypecode',
        'circuitcode', 'productgroepcode', 'begindatum_dbc_traject',
        'einddatum_dbc_traject', 'dbc_reden_sluiten_code', 'verkoopprijs_dbc',
        'prestatiecode', 'declaratiecode', 'dbc_tarief', 'verrekenbedrag'
    ]

    # Maps warehouse attribute names (keys) to source column names (values);
    # passed to every ensure()/insert() call that needs renaming.
    name_mapping = {
        'geslacht': 'geslacht',
        'lnd_land_code': 'landcode',
        'stn_zorgtrajectnummer': 'zorgtrajectnummer',
        'dia_diagnose_code': 'primaire_diagnose_code',
        'diagnose_trekken_van': 'primaire_diagnose_trekken_van',
        'stn_subtrajectnummer': 'dbc_trajectnummer',
        'zgt_zorgtype_code': 'zorgtypecode',
        'cct_circuit_code': 'circuitcode',
        'prg_productgroep_code': 'productgroepcode',
        'afs_afsluitreden_code': 'dbc_reden_sluiten_code',
        'fct_verkoopprijs': 'verkoopprijs_dbc',
        'psc_dbc_prestatiecode': 'prestatiecode',
        'psc_declaratiecode': 'declaratiecode',
        'fct_tarief': 'dbc_tarief',
        'fct_verrekenbedrag': 'verrekenbedrag'
    }

    # The context manager guarantees the compressed file handle is closed,
    # also when a row fails half-way through the load.
    with bz2.open(paths['data_path'] + '/' + file,
                  mode='rt',
                  encoding='cp1252') as source_file:
        source = csv.DictReader(source_file,
                                delimiter=';',
                                quotechar='"',
                                fieldnames=column_names)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.format(
            time.strftime('%H:%M:%S', starttime), file))

        for row in source:

            # convert date columns to appropriate format
            row['begindatum_zorgtraject'] = parse_dates(
                row['begindatum_zorgtraject'])
            row['einddatum_zorgtraject'] = parse_dates(
                row['einddatum_zorgtraject'])
            row['begindatum_dbc_traject'] = parse_dates(
                row['begindatum_dbc_traject'])
            row['einddatum_dbc_traject'] = parse_dates(
                row['einddatum_dbc_traject'])
            row['primaire_diagnose_datum'] = parse_dates(
                row['primaire_diagnose_datum'])

            # ensure DBC codes are filled to right length
            row['zorgtypecode'] = parse_codes(row['zorgtypecode'], 3, '_?_')
            row['primaire_diagnose_code'] = etl.getstr(
                row['primaire_diagnose_code']).upper()
            row['productgroepcode'] = parse_codes(row['productgroepcode'], 6,
                                                  '_?_')
            row['prestatiecode'] = parse_codes(row['prestatiecode'], 12, '_?_')

            # get tinyint codes
            row['dbc_reden_sluiten_code'] = etl.getint(
                row['dbc_reden_sluiten_code'], default=0)
            row['circuitcode'] = etl.getint(row['circuitcode'], default=0)

            # convert gender (geslacht) into int conforming to COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # diagnose_trekken_van: space = 0, J = 1
            row['primaire_diagnose_trekken_van'] = parse_boolean(
                row['primaire_diagnose_trekken_van'], default=0)

            # convert money values into decimals
            row['verkoopprijs_dbc'] = parse_money(row['verkoopprijs_dbc'])
            row['dbc_tarief'] = parse_money(row['dbc_tarief'])
            row['verrekenbedrag'] = parse_money(row['verrekenbedrag'])

            # derive dimension ids
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            row['dag_id_begindatum_zorgtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'begindatum_zorgtraject'})
            row['dag_id_einddatum_zorgtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'einddatum_zorgtraject'})
            # The DBC traject dates fill the "subtraject" day keys.
            row['dag_id_begindatum_subtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'begindatum_dbc_traject'})
            row['dag_id_einddatum_subtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'einddatum_dbc_traject'})
            row['dag_id_diagnose'] = DIM_DAG.lookup(
                row, {'dag_datum': 'primaire_diagnose_datum'})
            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['dia_id_primair'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['prg_id'] = DIM_PRODUCTGROEP.ensure(row, name_mapping)
            row['psc_id'] = DIM_PRESTATIECODE.ensure(row, name_mapping)
            row['cct_id'] = DIM_CIRCUIT.ensure(row, name_mapping)
            row['afs_id'] = DIM_AFSLUITREDEN.ensure(row, name_mapping)
            row['lnd_id'] = DIM_LAND.ensure(row, name_mapping)

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.format(
        time.strftime('%H:%M:%S', endtime), file))
    print('           Processing time: %0.2f seconds ' % (end_s - start_s))
def load_subtraject(file, config):
    """Load one bz2-compressed WOB GGZ subtraject file.

    Main ETL method for WOB GGZ subtrajecten. Reads the cp1252-encoded,
    ';'-separated source file (no header row; column names are supplied
    here), cleans each row in place, resolves its dimension keys and
    inserts it into ``FCT_SUBTRAJECT``. The global ``connection`` is
    committed once after the last row.

    Requires an active pygrametl connection as the global ``connection``
    and the module-level ``DIM_*`` / ``FCT_SUBTRAJECT`` table objects.

    Args:
        file: Name of the source file, relative to the configured
            ``data_path``.
        config: Object exposing ``get(section, option)``; the ``wob_ggz``
            section must provide ``data_path`` and ``staging_path``
            (presumably a ``configparser`` instance -- confirm at caller).
    """
    global connection
    paths = {'data_path': config.get('wob_ggz', 'data_path'),
             'staging_path': config.get('wob_ggz', 'staging_path')}
    column_names = ['datum_aanmaak',
                    'geslacht',
                    'landcode',
                    'zorgtrajectnummer',
                    'begindatum_zorgtraject',
                    'einddatum_zorgtraject',
                    'primaire_diagnose_code',
                    'primaire_diagnose_trekken_van',
                    'primaire_diagnose_datum',
                    'dbc_trajectnummer',
                    'zorgtypecode',
                    'circuitcode',
                    'productgroepcode',
                    'begindatum_dbc_traject',
                    'einddatum_dbc_traject',
                    'dbc_reden_sluiten_code',
                    'verkoopprijs_dbc',
                    'prestatiecode',
                    'declaratiecode',
                    'dbc_tarief',
                    'verrekenbedrag'
    ]

    # Maps warehouse attribute names (keys) to source column names (values);
    # passed to every ensure()/insert() call that needs renaming.
    name_mapping = {
        'geslacht': 'geslacht',
        'lnd_land_code': 'landcode',
        'stn_zorgtrajectnummer': 'zorgtrajectnummer',
        'dia_diagnose_code': 'primaire_diagnose_code',
        'diagnose_trekken_van': 'primaire_diagnose_trekken_van',
        'stn_subtrajectnummer': 'dbc_trajectnummer',
        'zgt_zorgtype_code': 'zorgtypecode',
        'cct_circuit_code': 'circuitcode',
        'prg_productgroep_code': 'productgroepcode',
        'afs_afsluitreden_code': 'dbc_reden_sluiten_code',
        'fct_verkoopprijs': 'verkoopprijs_dbc',
        'psc_dbc_prestatiecode': 'prestatiecode',
        'psc_declaratiecode': 'declaratiecode',
        'fct_tarief': 'dbc_tarief',
        'fct_verrekenbedrag': 'verrekenbedrag'
    }

    # Open the source through a context manager so the compressed file
    # handle is released even if processing a row raises.
    with bz2.open(paths['data_path'] + '/' + file,
                  mode='rt', encoding='cp1252') as source_file:
        source = csv.DictReader(source_file, delimiter=';',
                                quotechar='"',
                                fieldnames=column_names)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.
              format(time.strftime('%H:%M:%S', starttime), file))

        for row in source:

            # convert date columns to appropriate format
            row['begindatum_zorgtraject'] = parse_dates(row['begindatum_zorgtraject'])
            row['einddatum_zorgtraject'] = parse_dates(row['einddatum_zorgtraject'])
            row['begindatum_dbc_traject'] = parse_dates(row['begindatum_dbc_traject'])
            row['einddatum_dbc_traject'] = parse_dates(row['einddatum_dbc_traject'])
            row['primaire_diagnose_datum'] = parse_dates(row['primaire_diagnose_datum'])

            # ensure DBC codes are filled to right length
            row['zorgtypecode'] = parse_codes(row['zorgtypecode'], 3, '_?_')
            row['primaire_diagnose_code'] = etl.getstr(row['primaire_diagnose_code']).upper()
            row['productgroepcode'] = parse_codes(row['productgroepcode'], 6, '_?_')
            row['prestatiecode'] = parse_codes(row['prestatiecode'], 12, '_?_')

            # get tinyint codes
            row['dbc_reden_sluiten_code'] = etl.getint(
                row['dbc_reden_sluiten_code'], default=0)
            row['circuitcode'] = etl.getint(
                row['circuitcode'], default=0)

            # convert gender (geslacht) into int conforming to COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # diagnose_trekken_van: space = 0, J = 1
            row['primaire_diagnose_trekken_van'] = parse_boolean(
                row['primaire_diagnose_trekken_van'], default=0
            )

            # convert money values into decimals
            row['verkoopprijs_dbc'] = parse_money(row['verkoopprijs_dbc'])
            row['dbc_tarief'] = parse_money(row['dbc_tarief'])
            row['verrekenbedrag'] = parse_money(row['verrekenbedrag'])

            # derive dimension ids
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            row['dag_id_begindatum_zorgtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'begindatum_zorgtraject'})
            row['dag_id_einddatum_zorgtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'einddatum_zorgtraject'})
            # The DBC traject dates fill the "subtraject" day keys.
            row['dag_id_begindatum_subtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'begindatum_dbc_traject'})
            row['dag_id_einddatum_subtraject'] = DIM_DAG.lookup(
                row, {'dag_datum': 'einddatum_dbc_traject'})
            row['dag_id_diagnose'] = DIM_DAG.lookup(
                row, {'dag_datum': 'primaire_diagnose_datum'})
            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['dia_id_primair'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['prg_id'] = DIM_PRODUCTGROEP.ensure(row, name_mapping)
            row['psc_id'] = DIM_PRESTATIECODE.ensure(row, name_mapping)
            row['cct_id'] = DIM_CIRCUIT.ensure(row, name_mapping)
            row['afs_id'] = DIM_AFSLUITREDEN.ensure(row, name_mapping)
            row['lnd_id'] = DIM_LAND.ensure(row, name_mapping)

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.
          format(time.strftime('%H:%M:%S', endtime), file))
    print('           Processing time: %0.2f seconds ' % (end_s - start_s))