Example #1
    def save_output_file(cls, config, outputdir, job_count, frames,
                         investigations):

        cls.log.info("Got {} frames. Concatenating...".format(job_count))
        df = pd.concat(frames, ignore_index=True)

        out_file = os.path.join(outputdir, config['OUT_FILE'])
        out_file_investigate = os.path.join(
            outputdir, config['OUT_FILE_INVESTIGATE'])  # noqa
        utils.frame_to_file(df, out_file, delimiter=config['OUT_DELIMITER'])
        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        with open(out_file_investigate, 'w') as invf:
            for line in investigations:
                invf.write("{}\n".format(line))

        cls.log.info("Wrote hashes that need investigation to {}".format(
            out_file_investigate))
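
This helper relies on the project functions utils.frame_to_file and utils.get_file_size, which are not shown in these examples. Below is a minimal, hypothetical sketch of what such helpers could look like, assuming pandas and a plain delimited output file; the names and signatures are taken from the calls above, while the bodies are guesses.

# Hypothetical stand-ins for the utils helpers used above (not the real module).
import os

def frame_to_file(df, path, delimiter='|'):
    """Write a DataFrame to a delimited text file without the index."""
    df.to_csv(path, sep=delimiter, index=False)

def get_file_size(path):
    """Return a human-readable size for the file at `path`."""
    size = float(os.path.getsize(path))
    for unit in ('B', 'KB', 'MB', 'GB'):
        if size < 1024:
            return "{:.1f} {}".format(size, unit)
        size /= 1024
    return "{:.1f} TB".format(size)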
Example #2
    def generate(cls, config, inputdir, outputdir, partner,
                 ask=True, create_tables=True):
        """
        Read the "phi_hashes.csv" file and generate UUID's.

        Optionally save the "hash -> UUID" mapping to "links.csv"

       .. seealso::
           :meth:`_process_frame`

        """
        cls._validate_config(config)
        # engine = db_utils.get_db_engine(config)
        EXPECTED_COLS = config['EXPECTED_COLS']
        SAVE_OUT_FILE = config['SAVE_OUT_FILE']
        in_file = os.path.join(inputdir, config['IN_FILE'])

        cls.log.info("Using [{}] as source folder".format(inputdir))
        cls.log.info("Using [{}] as source file".format(in_file))
        cls.log.info("Connection HOST:DB - {}/{}"
                     .format(config['DB_HOST'], config['DB_NAME']))

        if ask:
            confirmed = utils.ask_yes_no(
                "Continue link procedure to create files in the [{}] folder?"
                .format(outputdir))

            if not confirmed:
                sys.exit("If you say so...")

        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})"
                         .format(in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check if the actual column names"
                         " in [{}] match the expected column"
                         " names: {}".format(in_file,
                                             sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # re-raise: there is no point continuing without a valid reader
            raise

        frames = []
        investigations = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        for index, df_source in enumerate(reader):
            df_source.fillna('', inplace=True)
            # The magic happens here...
            job = utils.apply_async(pool,
                                    cls._process_frame,
                                    (config, df_source, partner))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                df_temp, to_investigate = job.get()
                frames.append(df_temp)
                investigations.extend(to_investigate)

                if index % 100 == 0:
                    cls.log.info("Appended result {} (out of {})"
                                 .format(index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()

        if SAVE_OUT_FILE:
            cls.log.info("Got {} frames. Concatenating...".format(job_count))
            df = pd.concat(frames, ignore_index=True)

            out_file = os.path.join(outputdir, config['OUT_FILE'])
            out_file_investigate = os.path.join(outputdir, config['OUT_FILE_INVESTIGATE'])  # noqa
            utils.frame_to_file(df, out_file,
                                delimiter=config['OUT_DELIMITER'])
            cls.log.info("Wrote output file: {} ({} data rows, {})"
                         .format(
                             out_file, len(df), utils.get_file_size(out_file)))

            with open(out_file_investigate, 'w') as invf:

                for line in investigations:
                    invf.write("{}\n".format(line))

            cls.log.info("Wrote hashes that need investigation to {}"
                         .format(out_file_investigate))

        return True
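
Example #2 dispatches each CSV chunk to a worker pool and collects the per-chunk results afterwards. The following self-contained sketch shows the same chunked-read / fan-out / collect pattern using plain pandas and multiprocessing; process_frame, the file name, and the pool size are made up for illustration, and the real work happens in cls._process_frame via the utils.apply_async wrapper, which is not shown.

# Standalone sketch of the chunked read + worker pool pattern used above.
import multiprocessing as mp
import pandas as pd

def process_frame(df):
    # placeholder for the real per-chunk work done by cls._process_frame
    return df

if __name__ == '__main__':
    reader = pd.read_csv('phi_hashes.csv', dtype=object,
                         chunksize=10000, iterator=True)
    with mp.Pool(processes=4) as pool:
        # submit one asynchronous job per chunk
        jobs = [pool.apply_async(process_frame, (chunk.fillna(''),))
                for chunk in reader]
        # collect the results before the pool is torn down
        results = [job.get() for job in jobs]
    combined = pd.concat(results, ignore_index=True)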
Example #3
    def generate(cls, config, inputdir, outputdir):
        """
        Read the "phi.csv" file and generate "phi_hashes.csv"
        containing two (or more) sha256 strings for each line
        in the input file.

        .. seealso::

            run_hasher.py (the script that invokes this method)

        :param inputdir: directory name for the source file
        :param outputdir: directory name for generated file

        :rtype: bool
        :return: True on success; the frame with hashes of the PHI
            data is written to the output file

        Columns:
            - patid
            - sha_rule_1 (first_last_dob_race)
            - sha_rule_2 (first_last_dob_sex)

        """
        cls._validate_config(config)
        EXPECTED_COLS = config['EXPECTED_COLS']
        ENABLED_RULES = config.get('ENABLED_RULES')

        cls.log.info("Using [{}] as source folder".format(inputdir))
        cls.log.info("Using [{}] as salt".format(config['SALT']))
        cls.log.info("Expecting input file to contain columns: {}".format(
            EXPECTED_COLS))
        cls.log.info("Using [{}] as destination folder".format(outputdir))

        in_file = os.path.join(inputdir, config['IN_FILE'])
        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})".format(
                in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check if the actual column names"
                         " in [{}] match the expected column"
                         " names: {}".format(in_file, sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # re-raise: there is no point continuing without a valid reader
            raise

        frames = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        for index, df_source in enumerate(reader):
            cls.log.info("Processing {} lines of frame {}".format(
                config['LINES_PER_CHUNK'], index))
            df_source.fillna('', inplace=True)

            # make sure the frame contains all expected columns
            missing_cols = [col for col in EXPECTED_COLS
                            if col not in df_source.columns]
            if missing_cols:
                raise Exception(
                    "The input data frame does not have all "
                    "expected columns: {}. Missing: {}"
                    .format(EXPECTED_COLS, missing_cols))

            # validate the values constrained to set
            invalid_race = df_source.loc[
                ~df_source['race'].isin(VALID_RACE_VALS)]  # noqa
            invalid_sex = df_source.loc[~df_source['sex'].isin(VALID_SEX_VALS)]

            if len(invalid_race) > 0:
                cls.log.warning("Please check race: {}".format(invalid_race))
                raise Exception("The input file contains invalid values for "
                                "the `race` column. Please review the specs.")
            if len(invalid_sex) > 0:
                cls.log.warning("Please check sex: {}".format(invalid_sex))
                raise Exception("The input file contains invalid values for "
                                "the `sex` column. Please review the specs.")

            job = utils.apply_async(pool, cls._process_frame,
                                    (df_source, config))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                frames.append(job.get())
                if index % 10 == 0:
                    cls.log.info("Got results for frame {} (out of {})".format(
                        index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()
        cls.log.info("Got all {} frames. Concatenating...".format(job_count))
        df = pd.concat(frames, ignore_index=True)

        # Concatenation can re-order columns so we need to enforce the order
        out_columns = ['PATID']
        out_columns.extend(ENABLED_RULES)

        out_file = os.path.join(outputdir, config['OUT_FILE'])
        utils.frame_to_file(df[out_columns],
                            out_file,
                            delimiter=config['OUT_DELIMITER'])

        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        # count how many patients did not get a hash generated
        # query_no_hashes = df.query('m == "" & n == ""')
        query_no_hashes = ' & '.join(
            ['{} == ""'.format(rule) for rule in ENABLED_RULES])
        df_no_hashes = df.query(query_no_hashes)
        cls.log.info("The result file contains [{}] patients without hashes."
                     " See some examples below.".format(len(df_no_hashes)))
        cls.log.info(df_no_hashes.head())
        return True
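
The "patients without hashes" check at the end builds a pandas query string from the enabled rule names. A small worked example of that idiom follows; the rule names are the ones listed in the docstring above and the data rows are invented.

import pandas as pd

ENABLED_RULES = ['sha_rule_1', 'sha_rule_2']
df = pd.DataFrame({'PATID': ['1', '2'],
                   'sha_rule_1': ['ab12', ''],
                   'sha_rule_2': ['cd34', '']})

# builds: 'sha_rule_1 == "" & sha_rule_2 == ""'
query_no_hashes = ' & '.join(
    ['{} == ""'.format(rule) for rule in ENABLED_RULES])
print(df.query(query_no_hashes))  # only PATID 2 has no hashes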
Example #4
    def generate(cls, config, inputdir, outputdir, partner):
        """
        Read the "phi_hashes.csv" file and generate UUID's.

        Optionally save the "hash -> UUID" mapping to "links.csv"
        """
        cls._validate_config(config)
        engine = db.get_db_engine(config)

        # pass a session object to avoid creating it in the loop
        # TODO: add a method parameter for controlling the `create_table` flag
        session = db.get_db_session(engine, create_tables=True)

        EXPECTED_COLS = config['EXPECTED_COLS']
        SAVE_OUT_FILE = config['SAVE_OUT_FILE']
        in_file = os.path.join(inputdir, config['IN_FILE'])

        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})".format(
                in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check if the actual column names"
                         " in [{}] match the expected column"
                         " names: {}".format(in_file, sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # re-raise: there is no point continuing without a valid reader
            raise

        frames = []
        investigations = []

        for df_source in reader:
            df_source.fillna('', inplace=True)
            # The magic happens here...
            df, to_investigate = cls._process_frame(config, session, df_source,
                                                    partner)
            if SAVE_OUT_FILE:
                frames.append(df)
                investigations.extend(to_investigate)

        if SAVE_OUT_FILE:
            df = pd.concat(frames, ignore_index=True)
            out_file = os.path.join(outputdir, config['OUT_FILE'])
            out_file_investigate = os.path.join(outputdir,
                                                config['OUT_FILE_INVESTIGATE'])
            utils.frame_to_file(df,
                                out_file,
                                delimiter=config['OUT_DELIMITER'])
            cls.log.info("Wrote output file: {} ({} data rows, {})".format(
                out_file, len(df), utils.get_file_size(out_file)))

            with open(out_file_investigate, 'w') as invf:
                for line in investigations:
                    invf.write("{}\n".format(line))
            cls.log.info("Wrote hashes that need investigation to {}".format(
                out_file_investigate))
        return True
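
Unlike the other variants, this one creates a database engine and session once and reuses them while looping over the chunks. The db module itself is not shown; the sketch below is a hypothetical guess at what get_db_engine and get_db_session might wrap, assuming SQLAlchemy. The connection URL, driver, and every config key other than DB_HOST and DB_NAME are invented for illustration.

# Hypothetical stand-ins for the db helpers used above (SQLAlchemy assumed).
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def get_db_engine(config):
    # DB_USER and DB_PASS are assumed config keys; only DB_HOST and
    # DB_NAME actually appear in the examples above
    url = ("postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}/{DB_NAME}"
           .format(**config))
    return create_engine(url, pool_pre_ping=True)

def get_db_session(engine, create_tables=False):
    if create_tables:
        # a declarative Base.metadata.create_all(engine) call would go here
        pass
    return sessionmaker(bind=engine)()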
Example #5
    def generate(cls,
                 config,
                 inputdir,
                 outputdir,
                 partner,
                 ask=True,
                 create_tables=True):
        """
        Read the "phi_hashes.csv" file and generate UUID's.

        Optionally save the "hash -> UUID" mapping to "links.csv"

       .. seealso::
           :meth:`_process_frame`

        """
        cls._validate_config(config)
        in_file = os.path.join(inputdir, config['IN_FILE'])
        cls.log.info("Using {} as input file ({})".format(
            in_file, utils.get_file_size(in_file)))  # noqa
        cls.log.info("Connection HOST:DB - {}/{}".format(
            config['DB_HOST'], config['DB_NAME']))

        if ask:
            confirmed = utils.ask_yes_no(
                "Run [{}] linkage to create files in the [{}] folder?".format(
                    partner, outputdir))

            if not confirmed:
                sys.exit("If you say so...")

        df = cls._prepare_frame(config, inputdir, outputdir)

        frames = []
        investigations = []
        pool = mp.Pool(processes=NUM_CPUS)
        jobs = []

        chunksize = config['LINES_PER_CHUNK']

        # for index, df_source in enumerate(reader):
        for index, group in df.groupby(np.arange(len(df)) // chunksize):
            # cls.log.error("Frame chunk [{}]".format(index))
            df_source = pd.DataFrame(group)
            # The magic happens here...
            job = utils.apply_async(pool, cls._process_frame,
                                    (config, df_source, partner))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                df_temp, to_investigate = job.get()
                frames.append(df_temp)
                investigations.extend(to_investigate)

                if index % 100 == 0:
                    cls.log.info("Appended result {} (out of {})".format(
                        index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                cls.log.error(traceback.format_exc())

        pool.close()
        pool.join()

        if config['SAVE_OUT_FILE']:
            cls.save_output_file(config, outputdir, job_count, frames,
                                 investigations)

        return True
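
This variant loads the whole frame first and then slices it into fixed-size chunks with an integer-division group key before handing the pieces to the pool. A minimal illustration of that chunking idiom follows; the data and chunk size are invented.

import numpy as np
import pandas as pd

df = pd.DataFrame({'value': list(range(10))})
chunksize = 4

# np.arange(len(df)) // chunksize -> [0 0 0 0 1 1 1 1 2 2]
for index, group in df.groupby(np.arange(len(df)) // chunksize):
    print(index, len(group))  # chunks of 4, 4 and 2 rows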