def main():
    """Transform every input row and load it into the warehouse tables.

    Each row of ``inputdata`` is passed to ``extractdomaininfo`` and
    ``extractserverinfo`` (their return values are ignored, so they
    presumably enrich the row in place), its ``size`` is converted with
    ``pygrametl.getint``, the page/date/test keys are stored on the row and
    the row is inserted into ``facttbl``. One commit is issued once all
    rows have been handled.
    """
    for record in inputdata:
        extractdomaininfo(record)
        extractserverinfo(record)

        # Convert the size to an int.
        size_as_int = pygrametl.getint(record["size"])
        record["size"] = size_as_int

        # Add the data to the dimension tables; each key is written to the
        # row before the next dimension is consulted.
        page_key = pagesf.scdensure(record)
        record["pageid"] = page_key

        date_key = datedim.ensure(record, {"date": "downloaddate"})
        record["dateid"] = date_key

        test_key = testdim.lookup(record, {"testname": "test"})
        record["testid"] = test_key

        # Add the completed row to the fact table.
        facttbl.insert(record)

    connection.commit()
def main():
    """Transform every input row and load it into the warehouse tables.

    Each row of ``inputdata`` is passed to ``extractdomaininfo`` and
    ``extractserverinfo`` (their return values are ignored, so they
    presumably enrich the row in place), its ``size`` is converted with
    ``pygrametl.getint``, the page/date/test keys are stored on the row and
    the row is inserted into ``facttbl``. One commit is issued once all
    rows have been handled.
    """
    for record in inputdata:
        extractdomaininfo(record)
        extractserverinfo(record)

        # Convert the size to an int.
        size_as_int = pygrametl.getint(record['size'])
        record['size'] = size_as_int

        # Add the data to the dimension tables; each key is written to the
        # row before the next dimension is consulted.
        page_key = pagesf.scdensure(record)
        record['pageid'] = page_key

        date_key = datedim.ensure(record, {'date': 'downloaddate'})
        record['dateid'] = date_key

        test_key = testdim.lookup(record, {'testname': 'test'})
        record['testid'] = test_key

        # Add the completed row to the fact table.
        facttbl.insert(record)

    connection.commit()
def main():
    """Transform every input row and load it into the warehouse tables.

    Each row of ``inputdata`` is passed to ``extractdomaininfo`` and
    ``extractserverinfo`` (their return values are ignored, so they
    presumably enrich the row in place), its ``size`` is converted with
    ``pygrametl.getint``, the page/date/test keys are stored on the row and
    the row is inserted into ``facttbl``. One commit is issued once all
    rows have been handled.
    """
    for record in inputdata:
        extractdomaininfo(record)
        extractserverinfo(record)

        # Convert the size to an int.
        size_as_int = pygrametl.getint(record['size'])
        record['size'] = size_as_int

        # Add the data to the dimension tables; each key is written to the
        # row before the next dimension is consulted.
        page_key = pagesf.scdensure(record)
        record['pageid'] = page_key

        date_key = datedim.ensure(record, {'date': 'downloaddate'})
        record['dateid'] = date_key

        test_key = testdim.lookup(record, {'testname': 'test'})
        record['testid'] = test_key

        # Add the completed row to the fact table.
        facttbl.insert(record)

    connection.commit()
def load_str_dot(file, config):
    """Load one WOB ZZ DOT subtraject file into the fact table.

    Main ETL method for WOB ZZ subtrajecten. The bz2-compressed,
    semicolon-separated file is read row by row; every row has its date,
    code, boolean and money columns converted, its dimension keys resolved
    and is then inserted into ``FCT_SUBTRAJECT``. A single commit is issued
    after the last row. Start/finish times and the elapsed seconds are
    printed to stdout.

    Requires an active pygrametl connection in the module-level global
    ``connection``.

    Args:
        file: Name of the source file inside the ``data_path`` directory
            configured in the ``wob_zz`` section.
        config: Object whose ``get(section, option)`` returns the
            ``data_path`` and ``staging_path`` options of ``wob_zz``.
    """
    global connection

    # NOTE(review): staging_path is not used below; the lookup is kept so
    # the configuration is read exactly as before -- confirm it can go.
    paths = {'data_path': config.get('wob_zz', 'data_path'),
             'staging_path': config.get('wob_zz', 'staging_path')}

    # Column order of the header-less source file.
    names_STR = ['datum_aanmaak', 'landcode', 'geslacht',
                 'verwijzend_specialisme', 'zorgtrajectnummer',
                 'zorgtrajectnummer_parent', 'begindatum_zorgtraject',
                 'einddatum_zorgtraject', 'declaratiedatasetnummer',
                 'subtrajectnummer', 'subtraject_id', 'declaratiecode',
                 'behandelend_specialisme', 'zorgtypecode', 'zorgvraagcode',
                 'typerende_diagnose', 'icd10_vertaling_diagnose',
                 'hoofdtraject_indicatie', 'zorgproductcode',
                 'dbc_reden_sluiten', 'aanspraak_zvw',
                 'aanspraak_zvw_toegepast', 'zorgact_met_machtiging',
                 'oranje_zorgactiviteit',
                 'zorgactiviteitvertaling_toegepast',
                 'begindatum_subtraject', 'einddatum_subtraject',
                 'declaratiedatum', 'dbc_ziekenhuiskosten',
                 'honorarium_totaal']

    # Warehouse attribute name -> source column name, handed to the
    # dimension and fact table calls below.
    name_mapping = {
        'afl_afsluitreden_code': 'dbc_reden_sluiten',
        # 'beh_dbc_specialisme_code': 'behandelend_specialisme',
        # 'beh_dbc_behandeling_code': 'behandelcode',
        'dcl_declaratie_code': 'declaratiecode',
        'dia_dbc_specialisme_code': 'behandelend_specialisme',
        'dia_dbc_diagnose_code': 'typerende_diagnose',
        'geslacht': 'geslacht',
        'heeft_zorgactiviteit_met_machtiging': 'zorgact_met_machtiging',
        'heeft_oranje_zorgactiviteit': 'oranje_zorgactiviteit',
        'is_aanspraak_zvw': 'aanspraak_zvw',
        'is_aanspraak_zvw_toegepast': 'aanspraak_zvw_toegepast',
        'is_hoofdtraject': 'hoofdtraject_indicatie',
        'is_zorgactiviteitvertaling_toegepast':
            'zorgactiviteitvertaling_toegepast',
        'lnd_land_code': 'landcode',
        'stn_subtraject_id': 'subtraject_id',
        'stn_subtrajectnummer': 'subtrajectnummer',
        'stn_zorgtrajectnummer': 'zorgtrajectnummer',
        'stn_zorgtrajectnummer_parent': 'zorgtrajectnummer_parent',
        'zgt_dbc_specialisme_code': 'behandelend_specialisme',
        'zgt_dbc_zorgtype_code': 'zorgtypecode',
        'zgv_dbc_specialisme_code': 'behandelend_specialisme',
        'zgv_dbc_zorgvraag_code': 'zorgvraagcode',
        'zpr_dbc_zorgproduct_code': 'zorgproductcode',
        'fct_omzet_ziekenhuis': 'dbc_ziekenhuiskosten',
        'fct_omzet_honorarium_totaal': 'honorarium_totaal',
    }

    # Per-column conversions, listed in the order they are applied.
    date_columns = ('begindatum_zorgtraject', 'einddatum_zorgtraject',
                    'begindatum_subtraject', 'einddatum_subtraject',
                    'declaratiedatum')
    # (column, length, filler) triples passed on to parse_codes.
    code_columns = (('verwijzend_specialisme', 4, '_?_'),
                    ('behandelend_specialisme', 4, '_?_'),
                    ('zorgtypecode', 2, '??'),
                    ('zorgvraagcode', 4, '_?_'),
                    ('typerende_diagnose', 4, '_?_'),
                    ('zorgproductcode', 9, '_?_'))
    boolean_columns = ('hoofdtraject_indicatie', 'aanspraak_zvw',
                       'aanspraak_zvw_toegepast', 'oranje_zorgactiviteit',
                       'zorgact_met_machtiging',
                       'zorgactiviteitvertaling_toegepast')
    money_columns = ('dbc_ziekenhuiskosten', 'honorarium_totaal')
    # (key written to the row, date column looked up in DIM_DAG).
    day_lookups = (
        ('dag_id_begindatum_zorgtraject', 'begindatum_zorgtraject'),
        ('dag_id_einddatum_zorgtraject', 'einddatum_zorgtraject'),
        ('dag_id_begindatum_subtraject', 'begindatum_subtraject'),
        ('dag_id_einddatum_subtraject', 'einddatum_subtraject'),
        ('dag_id_declaratiedatum', 'declaratiedatum'),
    )

    # The context manager guarantees the compressed file is closed, also
    # when a row fails half-way through the load.
    with bz2.open(paths['data_path'] + '/' + file, mode='rt') as source_file:
        source = csv.DictReader(source_file, delimiter=';', quotechar='"',
                                fieldnames=names_STR)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.format(
            time.strftime('%H:%M:%S', starttime), file))

        for row in source:
            # convert datecolumns to appropriate format
            for column in date_columns:
                row[column] = parse_dates(row[column])

            # ensure DBC codes are filled to right length
            for column, length, filler in code_columns:
                row[column] = parse_codes(row[column], length, filler)

            # convert geslacht into int conform COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # convert booleans
            for column in boolean_columns:
                row[column] = parse_boolean(row[column])

            # convert money values into decimals
            for column in money_columns:
                row[column] = parse_money(row[column])

            # derive dimension_ids
            row['beh_id'] = -1  # no behandelcodes in DOT per 2012-01-01
            for key, column in day_lookups:
                row[key] = DIM_DAG.lookup(row, {'dag_datum': column})

            row['dia_id'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['zgv_id'] = DIM_ZORGVRAAG.ensure(row, name_mapping)
            row['zpr_id'] = DIM_ZORGPRODUCT.ensure(row, name_mapping)
            # The same dimension is used twice: once for the treating and
            # once for the referring specialism.
            row['zvs_id_behandelend'] = DIM_ZORGVERLENERSOORT.ensure(
                row,
                {'zvs_vektis_zorgverlenersoort_code':
                 'behandelend_specialisme'})
            row['zvs_id_verwijzend'] = DIM_ZORGVERLENERSOORT.ensure(
                row,
                {'zvs_vektis_zorgverlenersoort_code':
                 'verwijzend_specialisme'})

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.format(
        time.strftime('%H:%M:%S', endtime), file))
    print(' Processing time: %0.2f seconds ' % (end_s - start_s))
def convertsize(row):
    """Replace ``row['size']`` in place with ``pygrametl.getint`` of it.

    Nothing is returned; the caller's row object is modified.
    """
    size_as_int = pygrametl.getint(row['size'])
    row['size'] = size_as_int
def load_subtraject(file, config):
    """Load one WOB GGZ subtraject file into the fact table.

    Main ETL method for WOB GGZ subtrajecten. The bz2-compressed,
    cp1252-encoded, semicolon-separated file is read row by row; every row
    has its date, code, boolean and money columns converted, its dimension
    keys resolved and is then inserted into ``FCT_SUBTRAJECT``. A single
    commit is issued after the last row. Start/finish times and the elapsed
    seconds are printed to stdout.

    Requires an active pygrametl connection in the module-level global
    ``connection``.

    Args:
        file: Name of the source file inside the ``data_path`` directory
            configured in the ``wob_ggz`` section.
        config: Object whose ``get(section, option)`` returns the
            ``data_path`` and ``staging_path`` options of ``wob_ggz``.
    """
    global connection

    # NOTE(review): staging_path is not used below; the lookup is kept so
    # the configuration is read exactly as before -- confirm it can go.
    paths = {
        'data_path': config.get('wob_ggz', 'data_path'),
        'staging_path': config.get('wob_ggz', 'staging_path'),
    }

    # Column order of the header-less source file.
    column_names = [
        'datum_aanmaak', 'geslacht', 'landcode', 'zorgtrajectnummer',
        'begindatum_zorgtraject', 'einddatum_zorgtraject',
        'primaire_diagnose_code', 'primaire_diagnose_trekken_van',
        'primaire_diagnose_datum', 'dbc_trajectnummer', 'zorgtypecode',
        'circuitcode', 'productgroepcode', 'begindatum_dbc_traject',
        'einddatum_dbc_traject', 'dbc_reden_sluiten_code',
        'verkoopprijs_dbc', 'prestatiecode', 'declaratiecode',
        'dbc_tarief', 'verrekenbedrag',
    ]

    # Warehouse attribute name -> source column name, handed to the
    # dimension and fact table calls below.
    name_mapping = {
        'geslacht': 'geslacht',
        'lnd_land_code': 'landcode',
        'stn_zorgtrajectnummer': 'zorgtrajectnummer',
        'dia_diagnose_code': 'primaire_diagnose_code',
        'diagnose_trekken_van': 'primaire_diagnose_trekken_van',
        'stn_subtrajectnummer': 'dbc_trajectnummer',
        'zgt_zorgtype_code': 'zorgtypecode',
        'cct_circuit_code': 'circuitcode',
        'prg_productgroep_code': 'productgroepcode',
        'afs_afsluitreden_code': 'dbc_reden_sluiten_code',
        'fct_verkoopprijs': 'verkoopprijs_dbc',
        'psc_dbc_prestatiecode': 'prestatiecode',
        'psc_declaratiecode': 'declaratiecode',
        'fct_tarief': 'dbc_tarief',
        'fct_verrekenbedrag': 'verrekenbedrag',
    }

    # Per-column conversions, listed in the order they are applied.
    date_columns = ('begindatum_zorgtraject', 'einddatum_zorgtraject',
                    'begindatum_dbc_traject', 'einddatum_dbc_traject',
                    'primaire_diagnose_datum')
    money_columns = ('verkoopprijs_dbc', 'dbc_tarief', 'verrekenbedrag')
    # (key written to the row, date column looked up in DIM_DAG). The
    # dbc_traject dates feed the *_subtraject keys.
    day_lookups = (
        ('dag_id_begindatum_zorgtraject', 'begindatum_zorgtraject'),
        ('dag_id_einddatum_zorgtraject', 'einddatum_zorgtraject'),
        ('dag_id_begindatum_subtraject', 'begindatum_dbc_traject'),
        ('dag_id_einddatum_subtraject', 'einddatum_dbc_traject'),
        ('dag_id_diagnose', 'primaire_diagnose_datum'),
    )

    # The context manager guarantees the compressed file is closed, also
    # when a row fails half-way through the load.
    with bz2.open(paths['data_path'] + '/' + file, mode='rt',
                  encoding='cp1252') as source_file:
        source = csv.DictReader(source_file, delimiter=';', quotechar='"',
                                fieldnames=column_names)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.format(
            time.strftime('%H:%M:%S', starttime), file))

        for row in source:
            # convert datecolumns to appropriate format
            for column in date_columns:
                row[column] = parse_dates(row[column])

            # ensure DBC codes are filled to right length
            row['zorgtypecode'] = parse_codes(row['zorgtypecode'], 3, '_?_')
            row['primaire_diagnose_code'] = etl.getstr(
                row['primaire_diagnose_code']).upper()
            row['productgroepcode'] = parse_codes(
                row['productgroepcode'], 6, '_?_')
            row['prestatiecode'] = parse_codes(
                row['prestatiecode'], 12, '_?_')

            # get tinyint codes
            row['dbc_reden_sluiten_code'] = etl.getint(
                row['dbc_reden_sluiten_code'], default=0)
            row['circuitcode'] = etl.getint(row['circuitcode'], default=0)

            # convert geslacht into int conform COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # diagnose_trekken_van: 'spatie' (space) = 0, J=1
            row['primaire_diagnose_trekken_van'] = parse_boolean(
                row['primaire_diagnose_trekken_van'], default=0)

            # convert money values into decimals
            for column in money_columns:
                row[column] = parse_money(row[column])

            # derive dimension_ids
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            for key, column in day_lookups:
                row[key] = DIM_DAG.lookup(row, {'dag_datum': column})

            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['dia_id_primair'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['prg_id'] = DIM_PRODUCTGROEP.ensure(row, name_mapping)
            row['psc_id'] = DIM_PRESTATIECODE.ensure(row, name_mapping)
            row['cct_id'] = DIM_CIRCUIT.ensure(row, name_mapping)
            row['afs_id'] = DIM_AFSLUITREDEN.ensure(row, name_mapping)
            row['lnd_id'] = DIM_LAND.ensure(row, name_mapping)

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.format(
        time.strftime('%H:%M:%S', endtime), file))
    print(' Processing time: %0.2f seconds ' % (end_s - start_s))
def load_subtraject(file, config):
    """Load one WOB GGZ subtraject file into the fact table.

    Main ETL method for WOB GGZ subtrajecten. The bz2-compressed,
    cp1252-encoded, semicolon-separated file is read row by row; every row
    has its date, code, boolean and money columns converted, its dimension
    keys resolved and is then inserted into ``FCT_SUBTRAJECT``. A single
    commit is issued after the last row. Start/finish times and the elapsed
    seconds are printed to stdout.

    Requires an active pygrametl connection in the module-level global
    ``connection``.

    Args:
        file: Name of the source file inside the ``data_path`` directory
            configured in the ``wob_ggz`` section.
        config: Object whose ``get(section, option)`` returns the
            ``data_path`` and ``staging_path`` options of ``wob_ggz``.
    """
    global connection

    # NOTE(review): staging_path is not used below; the lookup is kept so
    # the configuration is read exactly as before -- confirm it can go.
    paths = {'data_path': config.get('wob_ggz', 'data_path'),
             'staging_path': config.get('wob_ggz', 'staging_path')}

    # Column order of the header-less source file.
    column_names = ['datum_aanmaak', 'geslacht', 'landcode',
                    'zorgtrajectnummer', 'begindatum_zorgtraject',
                    'einddatum_zorgtraject', 'primaire_diagnose_code',
                    'primaire_diagnose_trekken_van',
                    'primaire_diagnose_datum', 'dbc_trajectnummer',
                    'zorgtypecode', 'circuitcode', 'productgroepcode',
                    'begindatum_dbc_traject', 'einddatum_dbc_traject',
                    'dbc_reden_sluiten_code', 'verkoopprijs_dbc',
                    'prestatiecode', 'declaratiecode', 'dbc_tarief',
                    'verrekenbedrag']

    # Warehouse attribute name -> source column name, handed to the
    # dimension and fact table calls below.
    name_mapping = {
        'geslacht': 'geslacht',
        'lnd_land_code': 'landcode',
        'stn_zorgtrajectnummer': 'zorgtrajectnummer',
        'dia_diagnose_code': 'primaire_diagnose_code',
        'diagnose_trekken_van': 'primaire_diagnose_trekken_van',
        'stn_subtrajectnummer': 'dbc_trajectnummer',
        'zgt_zorgtype_code': 'zorgtypecode',
        'cct_circuit_code': 'circuitcode',
        'prg_productgroep_code': 'productgroepcode',
        'afs_afsluitreden_code': 'dbc_reden_sluiten_code',
        'fct_verkoopprijs': 'verkoopprijs_dbc',
        'psc_dbc_prestatiecode': 'prestatiecode',
        'psc_declaratiecode': 'declaratiecode',
        'fct_tarief': 'dbc_tarief',
        'fct_verrekenbedrag': 'verrekenbedrag',
    }

    # Per-column conversions, listed in the order they are applied.
    date_columns = ('begindatum_zorgtraject', 'einddatum_zorgtraject',
                    'begindatum_dbc_traject', 'einddatum_dbc_traject',
                    'primaire_diagnose_datum')
    money_columns = ('verkoopprijs_dbc', 'dbc_tarief', 'verrekenbedrag')
    # (key written to the row, date column looked up in DIM_DAG). The
    # dbc_traject dates feed the *_subtraject keys.
    day_lookups = (
        ('dag_id_begindatum_zorgtraject', 'begindatum_zorgtraject'),
        ('dag_id_einddatum_zorgtraject', 'einddatum_zorgtraject'),
        ('dag_id_begindatum_subtraject', 'begindatum_dbc_traject'),
        ('dag_id_einddatum_subtraject', 'einddatum_dbc_traject'),
        ('dag_id_diagnose', 'primaire_diagnose_datum'),
    )

    # The context manager guarantees the compressed file is closed, also
    # when a row fails half-way through the load.
    with bz2.open(paths['data_path'] + '/' + file, mode='rt',
                  encoding='cp1252') as source_file:
        source = csv.DictReader(source_file, delimiter=';', quotechar='"',
                                fieldnames=column_names)

        starttime = time.localtime()
        start_s = time.time()
        print('{} - Start processing file: {}'.format(
            time.strftime('%H:%M:%S', starttime), file))

        for row in source:
            # convert datecolumns to appropriate format
            for column in date_columns:
                row[column] = parse_dates(row[column])

            # ensure DBC codes are filled to right length
            row['zorgtypecode'] = parse_codes(row['zorgtypecode'], 3, '_?_')
            row['primaire_diagnose_code'] = etl.getstr(
                row['primaire_diagnose_code']).upper()
            row['productgroepcode'] = parse_codes(
                row['productgroepcode'], 6, '_?_')
            row['prestatiecode'] = parse_codes(
                row['prestatiecode'], 12, '_?_')

            # get tinyint codes
            row['dbc_reden_sluiten_code'] = etl.getint(
                row['dbc_reden_sluiten_code'], default=0)
            row['circuitcode'] = etl.getint(row['circuitcode'], default=0)

            # convert geslacht into int conform COD046_NEN / Vektis
            row['geslacht'] = etl.getint(row['geslacht'], default=0)

            # diagnose_trekken_van: 'spatie' (space) = 0, J=1
            row['primaire_diagnose_trekken_van'] = parse_boolean(
                row['primaire_diagnose_trekken_van'], default=0)

            # convert money values into decimals
            for column in money_columns:
                row[column] = parse_money(row[column])

            # derive dimension_ids
            row['stn_id'] = DIM_SUBTRAJECTNUMMER.ensure(row, name_mapping)
            for key, column in day_lookups:
                row[key] = DIM_DAG.lookup(row, {'dag_datum': column})

            row['zgt_id'] = DIM_ZORGTYPE.ensure(row, name_mapping)
            row['dia_id_primair'] = DIM_DIAGNOSE.ensure(row, name_mapping)
            row['prg_id'] = DIM_PRODUCTGROEP.ensure(row, name_mapping)
            row['psc_id'] = DIM_PRESTATIECODE.ensure(row, name_mapping)
            row['cct_id'] = DIM_CIRCUIT.ensure(row, name_mapping)
            row['afs_id'] = DIM_AFSLUITREDEN.ensure(row, name_mapping)
            row['lnd_id'] = DIM_LAND.ensure(row, name_mapping)

            # insert fact table
            FCT_SUBTRAJECT.insert(row, name_mapping)

    connection.commit()

    end_s = time.time()
    endtime = time.localtime()
    print('{} - Finished processing {}'.format(
        time.strftime('%H:%M:%S', endtime), file))
    print(' Processing time: %0.2f seconds ' % (end_s - start_s))