Ejemplo n.º 1
0
    def mark_duplicates(cls, df_spark, duplicate_dict):
        """
        Flag every row whose primary key occurs more than once in its table.

        For each table named in ``duplicate_dict`` a ``flag_duplication``
        column is added (1 when the key combination appears more than once,
        0 otherwise) and a summary line is written to the log.

        @input
        df_spark -- dictionary mapping table name to Spark DataFrame; the
                    entries named in ``duplicate_dict`` are replaced in place
        duplicate_dict -- dictionary mapping table name to the list of
                          primary-key column names of that table
        @output
        The same ``df_spark`` dictionary, with the flagged DataFrames
        """
        for filename, key_cols in duplicate_dict.items():
            # Size of the partition the row belongs to. As before, only
            # non-null values of the first key column are counted.
            partition_size = SQL.count(key_cols[0]).over(
                Window.partitionBy(key_cols))
            # The window expression is used inline instead of through a
            # scratch "count" column, so a real column called "count" is no
            # longer overwritten and dropped.
            df_spark[filename] = df_spark[filename].withColumn(
                "flag_duplication",
                SQL.when(partition_size > 1, 1).otherwise(0))

            dup_total = df_spark[filename].count()
            dup_count = df_spark[filename].where(
                SQL.col("flag_duplication") == 1).count()
            # An empty table has no duplicates; avoid dividing by zero.
            if dup_total:
                dup_percent = round(float(dup_count) / dup_total * 100, 2)
            else:
                dup_percent = 0.0
            message = "mark_duplicate - {} - {} - {} - {}".format(
                filename, dup_total, dup_count, dup_percent)
            log.info(message)
        return df_spark
Ejemplo n.º 2
0
    def qc_lastdate(cls, dfs, last_dt):
        """
        Flag rows holding date values later than the patient's last possible date.

        :param dict dfs: table name -> pandas DataFrame. Entries of tables
            that have date columns to check are replaced by a frame with an
            added ``last_dt_flag`` column (1.0 for offending rows, else 0.0).
        :param dict last_dt: check specification. Keys read here:
            ``table`` / ``id_column`` / ``dt_column`` (where the reference
            date lives), ``dt_type`` (``'float'`` or anything else for
            datetimes), ``dt_value`` (substring selecting the columns to
            check), and the optional ``ignore_value`` (substring excluding
            columns) and ``ignorecols`` (explicit column names to skip).

        :return: the same ``dfs`` dictionary; a CSV report is written per
            failing table under ``cls.output_loc``.
        """
        print("\nCheck dates that come after last date values...")
        id_col = last_dt['id_column']
        ref_col = last_dt['dt_column']
        dt_type = 'float64' if last_dt['dt_type'] == 'float' else 'datetime64[ns]'
        # Optional settings: a missing key behaves like an explicit None.
        ignore_value = last_dt.get("ignore_value")
        ignore_cols = last_dt.get('ignorecols') or []

        lastdt_df = dfs[last_dt['table']][[id_col, ref_col]].drop_duplicates()
        lastdt_df[ref_col] = lastdt_df[ref_col].astype(dt_type)

        for name, table in dfs.items():
            df = table.copy()
            cols = [c for c in df.columns
                    if last_dt["dt_value"] in c
                    and not (ignore_value and ignore_value in c)]

            # The reference column is re-attached by the merge below, so it
            # must leave the table whenever it is present -- not only when it
            # matched the dt_value filter. Otherwise the merge renames it to
            # <col>_x / <col>_y and the final drop raises KeyError.
            if ref_col in df.columns:
                df = df.drop(columns=ref_col)
            # NOTE(review): as in the previous implementation, ignored date
            # columns are removed from the returned table, not just skipped.
            skipped = [c for c in cols if c != ref_col and c in ignore_cols]
            if skipped:
                df = df.drop(columns=skipped)
            cols = [c for c in cols if c != ref_col and c not in ignore_cols]
            if not cols:
                continue

            # presumably 'UK' marks an unknown day/month part -- TODO confirm
            df[cols] = df[cols].replace('UK', '15', regex=True)
            for col in cols:
                df[col] = df[col].astype(dt_type)

            # NOTE(review): the outer join also adds rows for ids that only
            # exist in the reference table; kept as-is.
            inter_df = df.merge(lastdt_df, how='outer', on=id_col)
            querystring = inequality_dt_query(ref_col, cols, '<', '|')
            df_last_dt_obs = inter_df.query(querystring, engine='python').assign(last_dt_flag=1)

            # Flag by row label rather than merging on every column, which
            # multiplied fully duplicated rows (k copies became k*k).
            is_late = inter_df.index.isin(df_last_dt_obs.index)
            dfs[name] = (inter_df
                         .assign(last_dt_flag=is_late.astype(float))
                         .drop(columns=[ref_col]))

            if df_last_dt_obs.shape[0] != 0:
                print("\t Table: ", name, ">>>FAIL")
                csv_name = 'qc_lastdt_' + name + '.csv'
                report_path = os.path.join(cls.output_loc, csv_name)
                file_to_write = pdToStringIO(df_last_dt_obs)
                DataIO.write(DataIO.RepoType.LOCAL, 'file://' + report_path, content=file_to_write, query=None,
                             username=None,
                             password=None, write_type=DataIO.WriteType.NEW)

                message = "qc_lastdate - {} - {}".format(name, report_path)
                log.info(message)
                print("report location for {}: {}".format(name, report_path))

        print("...Done")
        return dfs
Ejemplo n.º 3
0
    def qc_dup(cls, dfs, dup_dict):
        """
        Check duplications for specific columns. Duplication is defined as
        multiple observations on the same day for the same patient.

        :param dict dfs: A dictionary that stores dataframe as the value and
            name as the key. Tables in which duplicates are found are replaced
            by a frame carrying ``dup_count`` and ``dup_flag`` columns.
        :param dict dup_dict: Specify which tables and columns to check. Table
            name is as key and the list of duplicated keys as values.

        :return: the same ``dfs`` dictionary; prints a summary and writes one
            CSV report per failing table under ``cls.output_loc``.

        :example:
        dup_dict = {
            'Stem_Cell_Transplant': ['PATCOD','SCTYN','SCTDAT'],
            'Serum_M-Protein': ['PATCOD','LBDAT']}
        """
        print("\nCheck duplicated values...")

        for name, cols in dup_dict.items():
            print("\t Table: ", name)
            df = dfs[name]
            df_dup_cts = return_multiple_records(df, cols)
            rule_obj = UniquePKRule(df, cols)
            rule_obj.result_message(rule_obj.validation_result)
            # The returned event was never used; the call is kept in case it
            # has side effects.
            rule_obj.result_to_event("QC_Engine", "001")

            if df_dup_cts.shape[0] == 0:
                continue

            print("\t >>>FAIL")
            df_dup_obs = df_dup_cts.merge(df, how='inner', on=cols)
            # Only the two QC columns are filled. A blanket fillna(0) turned
            # every missing value of the table itself into 0.
            final_df = (df_dup_cts
                        .rename(columns={"counts": "dup_count"})
                        .assign(dup_flag=1)
                        .merge(df, how='outer', on=cols)
                        .fillna({"dup_count": 0, "dup_flag": 0}))

            dup_total = final_df['dup_flag'].sum()
            total = len(final_df.index)
            dup_percent = round(float(dup_total) / total * 100, 2)

            dfs[name] = final_df
            csv_name = 'qc_dup_' + name + '.csv'
            report_path = os.path.join(cls.output_loc, csv_name)
            file_to_write = pdToStringIO(df_dup_obs)
            DataIO.write(DataIO.RepoType.LOCAL, 'file://'+report_path, content=file_to_write, query=None, username=None, password=None, write_type=DataIO.WriteType.NEW)

            message = "mark_duplicate - {} - {} - {} - {} - {}".format(name, total, dup_total, dup_percent, report_path)
            log.info(message)
            print("report location for {}: {}".format(name, report_path))

        print("...Done")
        return dfs
Ejemplo n.º 4
0
    def qc_string_in_num(cls, dfs, string_tables, outdir=None):
        """
        Check if there are string values in numeric fields.

        Columns whose name contains ``string_tables["value"]`` are treated as
        numeric fields. Each value is converted to ``str`` and tested with
        ``is_not_positive_digit``; NaN values are left out of the report.

        :param dict dfs: table name -> pandas DataFrame. Tables with at least
            one checked column are replaced by a copy with an added
            ``value_flag`` column (1.0 when any checked column of the row
            failed the test, else 0.0).
        :param dict string_tables: only the ``"value"`` key is read.
        :param outdir: unused; the report goes to ``cls.output_loc``.

        :return: the same ``dfs`` dictionary; ``qc_value.csv`` lists the
            offending values per table and column.
        """
        print("\nCheck string values in numeric fields...")
        output_list = []
        marker = string_tables["value"]

        for name, table in dfs.items():
            cols = [c for c in table.columns if marker in c]
            print("\n" + name)
            print(cols)
            if not cols:
                continue

            # Row-wise OR of the per-column masks. Tracking the masks
            # explicitly (instead of select_dtypes([bool])) keeps the table's
            # own boolean columns from influencing the flag.
            row_flag = np.zeros(len(table), dtype=bool)
            for col in cols:
                mask = (table[col].astype(str)
                        .apply(is_not_positive_digit)
                        .to_numpy(dtype=bool))
                # `s == s` is False only for NaN, which is not reported.
                strs = [s for s in table.loc[mask, col].unique() if s == s]
                if strs:
                    output_list.append([name, col, strs])
                row_flag |= mask

            # The flag is assigned positionally. The former outer merge on
            # all columns multiplied duplicated rows, and its `.any(1)` call
            # is rejected by pandas 2.x.
            dfs[name] = table.assign(value_flag=row_flag.astype(float))

        output_df = pd.DataFrame(
            output_list,
            columns=['tbl', 'column', 'strings'])

        report_path = os.path.join(cls.output_loc, 'qc_value.csv')
        file_to_write = pdToStringIO(output_df)
        DataIO.write(DataIO.RepoType.LOCAL, 'file://' + report_path, content=file_to_write, query=None, username=None,
                     password=None, write_type=DataIO.WriteType.NEW)

        message = "check_value - {}".format(report_path)
        log.info(message)
        print("report location: {}".format(report_path))
        print("...Done")
        return dfs
Ejemplo n.º 5
0
    def qc_missing(cls, dfs, missing_dict=None, outdir=None):
        """
        Return the summary of missing values for all tables, all columns.

        A value counts as missing when it is NaN or a string made only of
        whitespace (including the empty string).

        :param dict dfs: table name -> pandas DataFrame; not modified.
        :param dict missing_dict: optional table name -> columns to check.
            Tables not listed (or every table when omitted) are checked on
            all of their columns.
        :param outdir: unused; the report goes to ``cls.output_loc``.

        :return: dict of table name -> copy of the table with an added
            ``missing_flag`` column (1.0 when any checked column of the row is
            missing, else 0.0). ``qc_missing.csv`` holds the per-column
            summary produced by ``check_missing_per_tbl``.
        """
        print("\nCheck missing values...")
        # The default used to crash on `name in None`.
        missing_dict = missing_dict or {}
        df_list = []
        output_dict = {}

        for name, df in dfs.items():
            cols = missing_dict.get(name, list(df.columns))
            subset = [cols] if isinstance(cols, str) else list(cols)

            # Blank strings are treated as NaN for the checked columns only,
            # and the flag is attached to the untouched table. Merging a
            # blank-replaced copy back on all columns duplicated rows with
            # blanks in unchecked columns and flagged them as missing.
            blanks_as_nan = df[subset].replace(r'^\s*$', np.nan, regex=True)
            has_missing = blanks_as_nan.isna().any(axis=1)
            output_dict[name] = df.assign(
                missing_flag=has_missing.to_numpy(dtype=float))

            summary = (check_missing_per_tbl(df, cols)
                       .reset_index(drop=False)
                       .assign(tbl=name)
                       .rename(columns={'index': 'column'}))
            df_list.append(summary)

        df_qc_missing = pd.concat(df_list)
        report_path = os.path.join(cls.output_loc, 'qc_missing.csv')
        file_to_write = pdToStringIO(df_qc_missing)
        DataIO.write(DataIO.RepoType.LOCAL, 'file://' + report_path, content=file_to_write, query=None, username=None,
                     password=None, write_type=DataIO.WriteType.NEW)

        message = "mark_missing - {}".format(report_path)
        log.info(message)
        print("report location: {}".format(report_path))
        print("...Done")
        return output_dict