def data_loader(self, input_file_path):
    """
    Loads an input data file using its path and the source argument of the
    config file.

    :param input_file_path: path to the data.
    :return: pandas DataFrame object of the fully loaded data file.
    """

    source = self.config['general']['source']
    format_error_str = ("Most likely your dates are formatted "
                        "inconsistently. Make sure to visit "
                        "http://strftime.org/ for the correct date format "
                        "specs.")
    if source == 'dataframe':
        if isinstance(self.df_input, pd.DataFrame):
            try:
                return dataframe.parse_dataframe(self.config, self.df_input)
            except Exception:
                self.printer("We couldn't transform the dataframe. %s"
                             "\n\nTRACEBACK:\n\n%s" %
                             (format_error_str, traceback.format_exc()))
        else:
            self.printer("The source in the config file is set to "
                         "'dataframe', so you have to provide a pandas "
                         "DataFrame object as the df_input of the driver.")
    elif source == 'csv':
        try:
            return csv.read_csv(self.config, input_file_path)
        except Exception:
            self.printer("We couldn't load the following file: %s. %s"
                         "\n\nTRACEBACK:\n\n%s" %
                         (input_file_path, format_error_str,
                          traceback.format_exc()))
    elif source == 'rds':
        try:
            return rds.read_rds(self.config, input_file_path)
        except Exception:
            self.printer("We couldn't load the following file: %s. %s"
                         "\n\nTRACEBACK:\n\n%s" %
                         (input_file_path, format_error_str,
                          traceback.format_exc()))
    elif source == 'feather':
        try:
            return feather.read_feather(self.config, input_file_path)
        except Exception:
            self.printer("We couldn't load the following file: %s. %s"
                         "\n\nTRACEBACK:\n\n%s" %
                         (input_file_path, format_error_str,
                          traceback.format_exc()))
    else:
        raise ValueError("Currently we only support .csv, .rds and .feather "
                         "input files, or pandas DataFrame objects.")
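# The sketch below is a minimal, self-contained illustration of the same
# source-dispatch idea using plain pandas only; it is not the paqc
# implementation. The real data_loader above delegates to the paqc connectors
# (dataframe, csv, rds, feather) and takes the source type from the driver's
# config. The function name load_by_source is hypothetical.
import pandas as pd


def load_by_source(source, input_file_path=None, df_input=None):
    """Return a DataFrame according to the configured source type."""
    if source == 'dataframe':
        # The caller must hand over an already-loaded DataFrame.
        if not isinstance(df_input, pd.DataFrame):
            raise TypeError("source 'dataframe' requires a pandas DataFrame "
                            "as df_input.")
        return df_input
    elif source == 'csv':
        return pd.read_csv(input_file_path)
    elif source == 'feather':
        # pandas.read_feather needs pyarrow installed.
        return pd.read_feather(input_file_path)
    else:
        # .rds files need an extra dependency, so they are left out of this
        # simplified sketch.
        raise ValueError("Only 'dataframe', 'csv' and 'feather' are handled "
                         "in this sketch.")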
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_FLAGPROP = config_open(
    "paqc/tests/data/driver_dict_output_flagprop.yml")[1]


# 40
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_FLAGPROP])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Altered subset from v02.csv
        (csv.read_csv(DICT_CONFIG_FLAGPROP, "paqc/tests/data/qc40_check1.csv"),
         True, None),
        #
        (csv.read_csv(DICT_CONFIG_FLAGPROP, "paqc/tests/data/qc40_check2.csv"),
         False, ['C_11402COUNT', 'J_27370_DATE_FIRST_INDEX']),
    ])
def test_qc40(df, expected, ls_faults, dict_config):
    qc_params = dict_config['qc']['qc_params']
    rpi = qc40(df, dict_config, qc_params['ls_metrictypes'])
    assert (rpi.passed == expected) & (rpi.extra == ls_faults)


# 41
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_FLAGPROP])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG = config_open("paqc/tests/data/driver_dict_output.yml")[1]
DICT_CONFIG_16 = config_open("paqc/tests/data/qc16_driver_dict_output.yml")[1]
DICT_CONFIG_17TO19 = config_open(
    "paqc/tests/data/qc17to19_driver_dict_output.yml")[1]
DICT_CONFIG_20TO21 = config_open(
    "paqc/tests/data/qc20to21_driver_dict_output.yml")[1]


# 14
@pytest.mark.parametrize("dict_config", [DICT_CONFIG])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Subset from data/qc_data.csv
        (csv.read_csv(DICT_CONFIG, "paqc/tests/data/qc14_check1.csv"),
         True, None),
        # Row 3 is missing
        (csv.read_csv(DICT_CONFIG, "paqc/tests/data/qc14_check2.csv"),
         False, [2])
    ])
def test_qc14(df, expected, ls_faults, dict_config):
    rpi = qc14(df, dict_config)
    assert (rpi.passed == expected) & (rpi.extra == ls_faults)


# 15
@pytest.mark.parametrize("dict_config", [DICT_CONFIG])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Subset from data/qc_data.csv
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_CN01 = config_open(
    "paqc/tests/data/driver_dict_output_CN01.yml")[1]


# 27
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CN01])
@pytest.mark.parametrize(
    "df, expected", [
        # Original subset from data/initial_negative.csv
        (csv.read_csv(DICT_CONFIG_CN01, "paqc/tests/data/qc27_check1.csv"),
         True),
        # Deleted one row
        (csv.read_csv(DICT_CONFIG_CN01, "paqc/tests/data/qc27_check2.csv"),
         False),
    ])
def test_qc27(df, expected, dict_config):
    qc_params = dict_config['qc']['qc_params']
    rpi = qc27(df, dict_config, qc_params['path_file_cp02'],
               qc_params['pat_id_col_cp02'], qc_params['n01_match'])
    assert (rpi.passed == expected)


# 28
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CN01])
@pytest.mark.parametrize(
    "df, expected",
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_CS02 = config_open(
    "paqc/tests/data/driver_dict_output_CS02.yml")[1]


# 35
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CS02])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Original subset from data/random_sample_scoring.csv
        (csv.read_csv(DICT_CONFIG_CS02, "paqc/tests/data/qc25_check1.csv"),
         True, None),
        # Copied two patient IDs from the cp01 file into check2
        (csv.read_csv(DICT_CONFIG_CS02, "paqc/tests/data/qc25_check2.csv"),
         False, ['57616631', '81744431'])
    ])
def test_qc35(df, expected, ls_faults, dict_config):
    qc_params = dict_config['qc']['qc_params']
    rpi = qc35(df, dict_config, qc_params['path_file_cp01'],
               qc_params['pat_id_col_cp01'])
    assert (rpi.passed == expected) & (rpi.extra == ls_faults)
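# Usage note (assuming standard pytest discovery): any of these parametrized
# suites can be run on its own with pytest's -k selector, e.g.
#     pytest paqc/tests -k qc35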
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_1TO8 = config_open("paqc/tests/data/driver_dict_output.yml")[1]
DICT_CONFIG_9TO13 = config_open(
    "paqc/tests/data/qc9to13_driver_dict_output.yml")[1]


# 1
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_1TO8])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Original column names from data/qc_data.csv
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check1.csv"),
         True, None),
        # GENDER has trailing space, D_7931_AVG_CLAIM_CNT leading space
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check2.csv"),
         False, ['GENDER ', ' D_7931_AVG_CLAIM_CNT']),
        # Second column name is empty
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check3.csv"),
         False, ['Unnamed: 1']),
        # Created column name with single $
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check4.csv"),
         False, ['$']),
        # First column name is lab*el
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check5.csv"),
         False, ['lab*el'])
    ])
def test_qc1(df, expected, ls_faults, dict_config):
    rpi = qc1(df, dict_config)
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_CP01 = config_open(
    "paqc/tests/data/driver_dict_output_CP01.yml")[1]


# 22
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CP01])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Original column names from data/initial_pos.csv
        (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc22_check1.csv"),
         True, None),
        # Rows 0, 1 and 2 have NaN or 0 in the flag columns. The flag columns
        # are the only columns used to assess whether a row has any predictor
        # that is part of CP01.
        (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc22_check2.csv"),
         False, [0, 1, 2])
    ])
def test_qc22(df, expected, ls_faults, dict_config):
    rpi = qc22(df, dict_config)
    assert (rpi.passed == expected) & (rpi.extra == ls_faults)


# 23
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CP01])
@pytest.mark.parametrize(
    "df, expected, ls_faults", [
        # Original column names from data/initial_pos.csv
        (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc23_check1.csv"),
         True, None),
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

DICT_CONFIG_9TO13 = config_open(
    "paqc/tests/data/qc9to13_driver_dict_output.yml")[1]
DICT_CONFIG_48 = config_open("paqc/tests/data/qc48_driver_dict_output.yml")[1]
DICT_CONFIG_50 = config_open("paqc/tests/data/qc50_driver_dict_output.yml")[1]


# 46
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_9TO13])
@pytest.mark.parametrize(
    "df_old",
    [csv.read_csv(DICT_CONFIG_9TO13, "paqc/tests/data/suite2_df_old.csv")])
@pytest.mark.parametrize(
    "df_new, expected, ls_faults", [
        # Identical to suite2_df_old.csv
        (csv.read_csv(DICT_CONFIG_9TO13, "paqc/tests/data/qc46_check1.csv"),
         True, None),
        # A_last_exp_dt and A_first_exp_dt have swapped positions
        (csv.read_csv(DICT_CONFIG_9TO13, "paqc/tests/data/qc46_check2.csv"),
         False, None),
        # Column C_count is gone in the new dataframe, column D_count is new
        (csv.read_csv(DICT_CONFIG_9TO13, "paqc/tests/data/qc46_check3.csv"),
         False, {'missing columns': ['C_count'],
                 'new columns': ['D_count']})