def generate(cls, config, inputdir, outputdir, partner,
             ask=True, create_tables=True):
    """
    Read the "phi_hashes.csv" file and generate UUID's.
    Optionally save the "hash -> UUID" mapping to "links.csv"

    .. seealso::
        :meth:`_process_frame`
    """
    cls._validate_config(config)
    # engine = db_utils.get_db_engine(config)
    EXPECTED_COLS = config['EXPECTED_COLS']
    SAVE_OUT_FILE = config['SAVE_OUT_FILE']
    in_file = os.path.join(inputdir, config['IN_FILE'])
    cls.log.info("Using [{}] as source folder".format(inputdir))
    cls.log.info("Using [{}] as source file".format(in_file))
    cls.log.info("Connection HOST:DB - {}/{}"
                 .format(config['DB_HOST'], config['DB_NAME']))

    if ask:
        confirmed = utils.ask_yes_no(
            "Continue link procedure to create files in the [{}] folder?"
            .format(outputdir))

        if not confirmed:
            sys.exit("If you say so...")

    reader = None

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})"
                     .format(in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column names"
                     " file: {}.".format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))

    frames = []
    investigations = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []

    for index, df_source in enumerate(reader):
        df_source.fillna('', inplace=True)

        # The magic happens here...
        job = utils.apply_async(pool, cls._process_frame,
                                (config, df_source, partner))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            df_temp, to_investigate = job.get()
            frames.append(df_temp)
            investigations.extend(to_investigate)

            if index % 100 == 0:
                cls.log.info("Appended result {} (out of {})"
                             .format(index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()

    if SAVE_OUT_FILE:
        cls.log.info("Got {} frames. Concatenating...".format(job_count))
        df = pd.concat(frames, ignore_index=True)

        out_file = os.path.join(outputdir, config['OUT_FILE'])
        out_file_investigate = os.path.join(
            outputdir, config['OUT_FILE_INVESTIGATE'])
        utils.frame_to_file(df, out_file,
                            delimiter=config['OUT_DELIMITER'])
        cls.log.info("Wrote output file: {} ({} data rows, {})"
                     .format(out_file, len(df),
                             utils.get_file_size(out_file)))

        with open(out_file_investigate, 'w') as invf:
            for line in investigations:
                invf.write("{}\n".format(line))

        cls.log.info("Wrote hashes that need investigation to {}"
                     .format(out_file_investigate))

    return True
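
# --- Illustrative sketch (not part of the original module) -------------
# The `generate` method above relies on pandas' chunked reader so that a
# very large "phi_hashes.csv" never has to fit in memory at once.  The
# minimal standalone version below shows the same pattern; the file name,
# delimiter and chunk size are hypothetical values, not project config.
def _demo_chunked_read(in_file="phi_hashes.csv", chunk_size=10000):
    import pandas as pd

    reader = pd.read_csv(in_file,
                         sep=",",
                         dtype=object,          # keep hashes as strings
                         skipinitialspace=True,
                         skip_blank_lines=True,
                         chunksize=chunk_size,  # yields DataFrames lazily
                         iterator=True)

    for index, chunk in enumerate(reader):
        chunk.fillna('', inplace=True)          # mirror the code above
        print("chunk {} has {} rows".format(index, len(chunk)))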
def generate(cls, config, inputdir, outputdir, partner, ask=True,
             create_tables=True):
    """
    Read the "phi_hashes.csv" file and generate UUID's.
    Optionally save the "hash -> UUID" mapping to "links.csv"

    .. seealso::
        :meth:`_process_frame`
    """
    cls._validate_config(config)
    in_file = os.path.join(inputdir, config['IN_FILE'])
    cls.log.info("Using {} as input file ({})".format(
        in_file, utils.get_file_size(in_file)))
    cls.log.info("Connection HOST:DB - {}/{}".format(
        config['DB_HOST'], config['DB_NAME']))

    if ask:
        confirmed = utils.ask_yes_no(
            "Run [{}] linkage to create files in the [{}] folder?".format(
                partner, outputdir))

        if not confirmed:
            sys.exit("If you say so...")

    df = cls._prepare_frame(config, inputdir, outputdir)

    frames = []
    investigations = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []
    chunksize = config['LINES_PER_CHUNK']

    # for index, df_source in enumerate(reader):
    for index, group in df.groupby(np.arange(len(df)) // chunksize):
        # cls.log.error("Frame chunk [{}]".format(index))
        df_source = pd.DataFrame(group)

        # The magic happens here...
        job = utils.apply_async(pool, cls._process_frame,
                                (config, df_source, partner))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            df_temp, to_investigate = job.get()
            frames.append(df_temp)
            investigations.extend(to_investigate)

            if index % 100 == 0:
                cls.log.info("Appended result {} (out of {})".format(
                    index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            cls.log.error(traceback.format_exc())

    pool.close()
    pool.join()

    if config['SAVE_OUT_FILE']:
        cls.save_output_file(config, outputdir, job_count, frames,
                             investigations)

    return True
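
# --- Illustrative sketch (not part of the original module) -------------
# The second `generate` variant splits an already-loaded frame into
# fixed-size chunks with `df.groupby(np.arange(len(df)) // chunksize)`.
# The toy example below shows what that expression produces; the frame
# contents are made up for illustration.
def _demo_chunk_split(chunksize=4):
    import numpy as np
    import pandas as pd

    df = pd.DataFrame({'PATID': ['P{}'.format(n) for n in range(10)]})

    # `np.arange(len(df)) // chunksize` labels rows 0-3 as group 0,
    # rows 4-7 as group 1, rows 8-9 as group 2, and so on.
    for index, group in df.groupby(np.arange(len(df)) // chunksize):
        print("chunk {}: {} rows".format(index, len(group)))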
def generate(cls, config, inputdir, outputdir):
    """
    Read the "phi.csv" file and generate "phi_hashes.csv" containing
    two (or more) sha256 strings for each line in the input file.

    This method is invoked from the ``run_hasher.py`` script.

    :param inputdir: directory name for the source file
    :param outputdir: directory name for the generated file

    :rtype: bool
    :return: True after writing the output file, whose columns are:
        - PATID
        - sha_rule_1 (first_last_dob_race)
        - sha_rule_2 (first_last_dob_sex)
    """
    cls._validate_config(config)
    EXPECTED_COLS = config['EXPECTED_COLS']
    ENABLED_RULES = config.get('ENABLED_RULES')

    cls.log.info("Using [{}] as source folder".format(inputdir))
    cls.log.info("Using [{}] as salt".format(config['SALT']))
    cls.log.info("Expecting input file to contain columns: {}".format(
        EXPECTED_COLS))
    cls.log.info("Using [{}] as destination folder".format(outputdir))

    in_file = os.path.join(inputdir, config['IN_FILE'])
    reader = None

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})".format(
            in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column names"
                     " file: {}.".format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))

    frames = []
    pool = mp.Pool(processes=NUM_CPUS)
    jobs = []

    for index, df_source in enumerate(reader):
        cls.log.info("Processing {} lines of frame {}".format(
            config['LINES_PER_CHUNK'], index))
        df_source.fillna('', inplace=True)

        for col in EXPECTED_COLS:
            if col not in df_source.columns:
                raise Exception(
                    "The input data frame does not have all "
                    "expected columns: {}".format(EXPECTED_COLS))

        # validate the values constrained to a fixed set
        invalid_race = df_source.loc[
            ~df_source['race'].isin(VALID_RACE_VALS)]
        invalid_sex = df_source.loc[
            ~df_source['sex'].isin(VALID_SEX_VALS)]

        if len(invalid_race) > 0:
            cls.log.info("Please check race: {}".format(invalid_race))
            raise Exception("The input file contains an invalid value in "
                            "the `race` column. Please review the specs.")

        if len(invalid_sex) > 0:
            cls.log.warning("Please check sex: {}".format(invalid_sex))
            raise Exception("The input file contains an invalid value in "
                            "the `sex` column. Please review the specs.")

        job = utils.apply_async(pool, cls._process_frame,
                                (df_source, config))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            frames.append(job.get())

            if index % 10 == 0:
                cls.log.info("Got results for frame {} (out of {})".format(
                    index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()
    cls.log.info("Got all {} frames. Concatenating...".format(job_count))
    df = pd.concat(frames, ignore_index=True)

    # Concatenation can re-order columns so we need to enforce the order
    out_columns = ['PATID']
    out_columns.extend(ENABLED_RULES)

    out_file = os.path.join(outputdir, config['OUT_FILE'])
    utils.frame_to_file(df[out_columns], out_file,
                        delimiter=config['OUT_DELIMITER'])
    cls.log.info("Wrote output file: {} ({} data rows, {})".format(
        out_file, len(df), utils.get_file_size(out_file)))

    # count how many patients did not get a hash generated
    # query_no_hashes = df.query('m == "" & n == ""')
    query_no_hashes = ' & '.join(
        ['{} == ""'.format(rule) for rule in ENABLED_RULES])
    df_no_hashes = df.query(query_no_hashes)
    cls.log.info("The result file contains [{}] patients without hashes."
                 " See some examples below.".format(len(df_no_hashes)))
    cls.log.info(df_no_hashes.head())
    return True
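
# --- Illustrative sketch (not part of the original module) -------------
# The hasher counts patients that ended up with no hashes by building a
# `DataFrame.query` string from the enabled rule names, e.g.
# 'sha_rule_1 == "" & sha_rule_2 == ""'.  The rule names and frame
# contents below are assumed for illustration only.
def _demo_count_missing_hashes():
    import pandas as pd

    enabled_rules = ['sha_rule_1', 'sha_rule_2']
    df = pd.DataFrame({'PATID': ['1', '2', '3'],
                       'sha_rule_1': ['abc', '', ''],
                       'sha_rule_2': ['def', 'ghi', '']})

    query_no_hashes = ' & '.join(
        ['{} == ""'.format(rule) for rule in enabled_rules])
    df_no_hashes = df.query(query_no_hashes)   # only PATID '3' matches
    print(len(df_no_hashes))                   # -> 1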
def _process_frame(cls, config, session, df_source, partner_code):
    """
    Build a "PATID -> UUID" frame for one chunk of the source data.

    :return: a tuple (df, investigations) where `df` maps each PATID to
        its linkage UUID and hashes, and `investigations` collects the
        hashes that need investigation
    """
    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']
    investigations = []

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {ahash: link.linkage_uuid
                     for ahash, link in hash_uuid_lut.items()
                     if link is not None}
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}"
                  .format(len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()
    patients_with_no_hashes = []

    # TODO: Insert multiprocessing pool here
    jobs = []
    frames = []
    pool = mp.Pool(processes=NUM_CPUS)

    # TODO: move the `for index, row...` body into a function and call it
    # with utils.apply_async, passing the args
    for index, row in df_source.iterrows():
        patid = row['PATID']
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

        if len(pat_hashes) < 1:
            patients_with_no_hashes.append(patid)

        cls.log.debug("Parsing row for patient {} with {} hashes"
                      .format(patid, len(pat_hashes)))

        # Extract results from a pool.apply_async call using get()
        # See:
        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.AsyncResult
        job = utils.apply_async(pool, cls._process_patient_row,
                                (patid, pat_hashes.copy(), hash_uuid_lut,
                                 rules_cache, config, partner_code))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            frames.append(job.get())

            if index % 10 == 0:
                cls.log.info("Got results for frame {} (out of {})"
                             .format(index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()
    cls.log.info("Got all {} frames. Concatenating...".format(job_count))

    # TODO: set up a loop over frames to extract links and to_investigate
    # Kludgy, as we iterate again over df_source.iterrows() to extract PATID
    for frame, row in zip(frames, df_source.iterrows()):
        patid = row[1]['PATID']
        links = frame[0]
        to_investigate = frame[1]

        # Note: printing to console/writing to the log can slow processes...
        # Do we need to know the number of links created for each patid?
        # Is it possible to write this to the df and then to a SQL
        # table/csv file?
        cls.log.debug("Created {} links for patid: {}"
                      .format(len(links), patid))

        if len(to_investigate) > 0:
            investigations.append(to_investigate)

        i = 0

        for ahash, link in links.items():
            i += 1
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))
    cls.log.warning("{} out of {} patients are missing both hashes: {}"
                    .format(len(patients_with_no_hashes), len(df),
                            patients_with_no_hashes))
    return df, investigations
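
# --- Illustrative sketch (not part of the original module) -------------
# Both `generate` and `_process_frame` fan work out to a process pool and
# collect results later with `job.get()`.  Assuming `utils.apply_async`
# is a thin wrapper around `pool.apply_async`, the bare pattern looks like
# this; `_square` stands in for the real worker functions.
def _square(x):
    return x * x

def _demo_pool_fan_out(num_cpus=2):
    import multiprocessing as mp

    pool = mp.Pool(processes=num_cpus)
    jobs = [pool.apply_async(_square, (n,)) for n in range(10)]

    # AsyncResult.get() blocks until the corresponding job finishes and
    # re-raises any exception the worker hit.
    results = [job.get() for job in jobs]

    pool.close()
    pool.join()
    return results   # -> [0, 1, 4, 9, ...]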