Ejemplo n.º 1
0
    def data_loader(self, input_file_path):
        """
        Loads an input data file using its path and the source argument of
        the config file.

        :param input_file_path: path to the data.
        :return: pandas DataFrame object of the fully loaded datafile, or
                 None when loading fails (the error is reported through
                 self.printer instead of being raised).
        :raises ValueError: if the configured source is not one of
                 'dataframe', 'csv', 'rds' or 'feather'.
        """

        source = self.config['general']['source']
        format_error_str = ("It's most likely, that your dates are formatted "
                            "inconsistently. Make sure to visit "
                            "http://strftime.org/ for the correct date format "
                            "specs.")

        # One reader per file-based source; all three share the same call
        # signature and the same error-reporting path, so dispatch through a
        # table instead of three copy-pasted branches.
        file_readers = {
            'csv': csv.read_csv,
            'rds': rds.read_rds,
            'feather': feather.read_feather,
        }

        if source == 'dataframe':
            # pd.DataFrame is the stable public alias of
            # pd.core.frame.DataFrame.
            if isinstance(self.df_input, pd.DataFrame):
                try:
                    return dataframe.parse_dataframe(self.config, self.df_input)
                # Catch Exception rather than a bare `except:` so that
                # SystemExit / KeyboardInterrupt still propagate.
                except Exception:
                    self.printer("We couldn't transform the dataframe. %s"
                                 "\n\nTRACEBACK:\n\n%s"
                                 % (format_error_str, traceback.format_exc()))
            else:
                self.printer("Source in the config file is chosen as "
                             "dataframe, you have to provide a pandas' "
                             "DataFrame object as df_input of the driver.")
        elif source in file_readers:
            try:
                return file_readers[source](self.config, input_file_path)
            except Exception:
                self.printer("We couldn't load the following file: %s. %s"
                             "\n\nTRACEBACK:\n\n%s"
                             % (input_file_path, format_error_str,
                                traceback.format_exc()))
        else:
            raise ValueError("We only support .csv, .rds, .feather input files "
                             "or pandas DataFrame objects currently.")
Ejemplo n.º 2
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver config for the flag-propagation QCs below. config_open returns a
# tuple; element [1] is used as the config dict (presumably element [0] is a
# success flag -- confirm against config_open's definition).
DICT_CONFIG_FLAGPROP = config_open(
    "paqc/tests/data/driver_dict_output_flagprop.yml")[1]


# 40
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_FLAGPROP])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
        # Altered subset from v02.csv
        (csv.read_csv(DICT_CONFIG_FLAGPROP,
                      "paqc/tests/data/qc40_check1.csv"), True, None),
        #
        (csv.read_csv(DICT_CONFIG_FLAGPROP, "paqc/tests/data/qc40_check2.csv"),
         False, ['C_11402COUNT', 'J_27370_DATE_FIRST_INDEX']),
    ])
def test_qc40(df, expected, ls_faults, dict_config):
    """QC #40: clean subset passes; the altered one fails and the faulty
    column names are reported in the report's `extra`."""
    qc_params = dict_config['qc']['qc_params']
    rpi = qc40(df, dict_config, qc_params['ls_metrictypes'])
    # Two asserts instead of one bitwise-`&` expression: `and` semantics are
    # what was intended, and a failure now names the condition that broke.
    assert rpi.passed == expected
    assert rpi.extra == ls_faults


# 41
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_FLAGPROP])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
Ejemplo n.º 3
0
# Driver configs for the QC groups tested in this module. config_open returns
# a tuple; element [1] is used as the config dict (presumably element [0] is a
# success flag -- confirm against config_open's definition).
DICT_CONFIG = config_open("paqc/tests/data/driver_dict_output.yml")[1]
DICT_CONFIG_16 = config_open("paqc/tests/data/qc16_driver_dict_output.yml")[1]
DICT_CONFIG_17TO19 = config_open(
    "paqc/tests/data/qc17to19_driver_dict_output.yml")[1]
DICT_CONFIG_20TO21 = config_open(
    "paqc/tests/data/qc20to21_driver_dict_output.yml")[1]


# 14
@pytest.mark.parametrize("dict_config", [DICT_CONFIG])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
        # Subset from data/qc_data.csv
        (csv.read_csv(DICT_CONFIG,
                      "paqc/tests/data/qc14_check1.csv"), True, None),
        # row 3 is missing
        (csv.read_csv(DICT_CONFIG,
                      "paqc/tests/data/qc14_check2.csv"), False, [2])
    ])
def test_qc14(df, expected, ls_faults, dict_config):
    """QC #14: the intact subset passes; with a row removed the check fails
    and the report's `extra` carries the offending index."""
    rpi = qc14(df, dict_config)
    # Two asserts instead of one bitwise-`&` expression: `and` semantics are
    # what was intended, and a failure now names the condition that broke.
    assert rpi.passed == expected
    assert rpi.extra == ls_faults


# 15
@pytest.mark.parametrize("dict_config", [DICT_CONFIG])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
        # Subset from data/qc_data.csv
Ejemplo n.º 4
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver config for the CN01 cohort QCs below. config_open returns a tuple;
# element [1] is used as the config dict (presumably element [0] is a success
# flag -- confirm against config_open's definition).
DICT_CONFIG_CN01 = config_open(
    "paqc/tests/data/driver_dict_output_CN01.yml")[1]


# 27
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CN01])
@pytest.mark.parametrize(
    "df, expected",
    [
        # Original subset from data/initial_negative.csv
        (csv.read_csv(DICT_CONFIG_CN01,
                      "paqc/tests/data/qc27_check1.csv"), True),
        # Deleted one row
        (csv.read_csv(DICT_CONFIG_CN01,
                      "paqc/tests/data/qc27_check2.csv"), False),
    ])
def test_qc27(df, expected, dict_config):
    """QC #27: passes on the original subset, fails once a row is deleted."""
    params = dict_config['qc']['qc_params']
    report = qc27(df, dict_config, params['path_file_cp02'],
                  params['pat_id_col_cp02'], params['n01_match'])
    assert report.passed == expected


# 28
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CN01])
@pytest.mark.parametrize(
    "df, expected",
Ejemplo n.º 5
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver config for the CS02 scoring-sample QCs below. config_open returns a
# tuple; element [1] is used as the config dict (presumably element [0] is a
# success flag -- confirm against config_open's definition).
DICT_CONFIG_CS02 = config_open(
    "paqc/tests/data/driver_dict_output_CS02.yml")[1]


# 35
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CS02])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
        # Original subset from data/random_sample_scoring.csv
        (csv.read_csv(DICT_CONFIG_CS02,
                      "paqc/tests/data/qc25_check1.csv"), True, None),
        # Copied two patient IDs from the cp01 file into check2
        (csv.read_csv(DICT_CONFIG_CS02, "paqc/tests/data/qc25_check2.csv"),
         False, ['57616631', '81744431'])
    ])
def test_qc35(df, expected, ls_faults, dict_config):
    """QC #35: clean sample passes; duplicated cp01 patient IDs make the
    check fail, with the offending IDs reported in the report's `extra`."""
    qc_params = dict_config['qc']['qc_params']
    rpi = qc35(df, dict_config, qc_params['path_file_cp01'],
               qc_params['pat_id_col_cp01'])
    # Two asserts instead of one bitwise-`&` expression: `and` semantics are
    # what was intended, and a failure now names the condition that broke.
    assert rpi.passed == expected
    assert rpi.extra == ls_faults
Ejemplo n.º 6
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver configs for QCs 1-8 and 9-13. config_open returns a tuple; element
# [1] is used as the config dict (presumably element [0] is a success flag --
# confirm against config_open's definition).
DICT_CONFIG_1TO8 = config_open("paqc/tests/data/driver_dict_output.yml")[1]
DICT_CONFIG_9TO13 = config_open(
    "paqc/tests/data/qc9to13_driver_dict_output.yml")[1]


# 1
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_1TO8])
@pytest.mark.parametrize(
    "df, expected, ls_faults",
    [
        # Original column names from data/qc_data.csv
        (csv.read_csv(DICT_CONFIG_1TO8,
                      "paqc/tests/data/qc1_check1.csv"), True, None),
        # GENDER has trailing space, D_7931_AVG_CLAIM_CNT leading space
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check2.csv"),
         False, ['GENDER ', ' D_7931_AVG_CLAIM_CNT']),
        # Second column name is empty
        (csv.read_csv(DICT_CONFIG_1TO8, "paqc/tests/data/qc1_check3.csv"),
         False, ['Unnamed: 1']),
        # Created column name with single $
        (csv.read_csv(DICT_CONFIG_1TO8,
                      "paqc/tests/data/qc1_check4.csv"), False, ['$']),
        # First column name is lab*el
        (csv.read_csv(DICT_CONFIG_1TO8,
                      "paqc/tests/data/qc1_check5.csv"), False, ['lab*el'])
    ])
def test_qc1(df, expected, ls_faults, dict_config):
    """QC #1: column-name sanity check over several malformed headers.

    NOTE(review): `expected` and `ls_faults` are never asserted against in
    the visible body -- the snippet looks truncated. Sibling tests end with
    ``assert (rpi.passed == expected) & (rpi.extra == ls_faults)``; confirm
    against the full source before relying on this test.
    """
    rpi = qc1(df, dict_config)
Ejemplo n.º 7
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver config for the CP01 cohort QCs below. config_open returns a tuple;
# element [1] is used as the config dict (presumably element [0] is a success
# flag -- confirm against config_open's definition).
DICT_CONFIG_CP01 = config_open("paqc/tests/data/driver_dict_output_CP01.yml")[1]


# 22
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CP01])
@pytest.mark.parametrize("df, expected, ls_faults", [
    # Original column names from data/initial_pos.csv
    (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc22_check1.csv"),
     True, None),
    # Row 0, 1 and 2 have respectively NaN or 0 for the flag column.
    # flag columns are the only columns used to assess if a row has any
    # predictor part of CP01
    (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc22_check2.csv"),
     False, [0, 1, 2])
])
def test_qc22(df, expected, ls_faults, dict_config):
    """QC #22: clean file passes; rows whose flag columns carry NaN/0 make
    the check fail, with their indices reported in the report's `extra`."""
    rpi = qc22(df, dict_config)
    # Two asserts instead of one bitwise-`&` expression: `and` semantics are
    # what was intended, and a failure now names the condition that broke.
    assert rpi.passed == expected
    assert rpi.extra == ls_faults


# 23
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_CP01])
@pytest.mark.parametrize("df, expected, ls_faults", [
    # Original column names from data/initial_pos.csv
    (csv.read_csv(DICT_CONFIG_CP01, "paqc/tests/data/qc23_check1.csv"),
     True, None),
Ejemplo n.º 8
0
import pytest

from paqc.connectors import csv
from paqc.utils.config_utils import config_open

# Driver configs for QCs 9-13, 48 and 50. config_open returns a tuple;
# element [1] is used as the config dict (presumably element [0] is a success
# flag -- confirm against config_open's definition).
DICT_CONFIG_9TO13 = config_open(
    "paqc/tests/data/qc9to13_driver_dict_output.yml")[1]
DICT_CONFIG_48 = config_open("paqc/tests/data/qc48_driver_dict_output.yml")[1]
DICT_CONFIG_50 = config_open("paqc/tests/data/qc50_driver_dict_output.yml")[1]


# 46
@pytest.mark.parametrize("dict_config", [DICT_CONFIG_9TO13])
@pytest.mark.parametrize(
    "df_old",
    [csv.read_csv(DICT_CONFIG_9TO13, "paqc/tests/data/suite2_df_old.csv")])
@pytest.mark.parametrize(
    "df_new, expected, ls_faults",
    [
        # identical to suite2_df_old.csv
        (csv.read_csv(DICT_CONFIG_9TO13,
                      "paqc/tests/data/qc46_check1.csv"), True, None),
        # A_last_exp_dt and A_first_exp_dt are changed in position
        (csv.read_csv(DICT_CONFIG_9TO13,
                      "paqc/tests/data/qc46_check2.csv"), False, None),
        # column C_count is gone in the new dataframe, column D_count is new
        (csv.read_csv(DICT_CONFIG_9TO13,
                      "paqc/tests/data/qc46_check3.csv"), False, {
                          'missing columns': ['C_count'],
                          'new columns': ['D_count']
                      })