def setUp(self):
    """ Set the test database engine/session """
    super(BaseTestCase, self).setUp()
    config = Config(root_path='.', defaults={})
    config.from_pyfile(SETTINGS_FILE)
    self.config = config
    self.engine = db.get_db_engine(config)
    self.session = db.get_db_session(self.engine, create_tables=True)
    self.create_rules()
    self.create_partners()
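# Illustrative sketch (not part of the original test case): a minimal test
# that exercises the fixtures created by setUp().  The `RuleEntity` model and
# the `query()` call are assumptions based on the entities referenced
# elsewhere in this code base.
#
#     def test_rules_created(self):
#         """ The rules inserted by create_rules() should be queryable """
#         rules = self.session.query(RuleEntity).all()
#         self.assertTrue(len(rules) > 0)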
def _process_frame(cls, config, df_source, partner_code):
    """
    Process a fragment of the large file.

    Note: since the `session` object can't be pickled we create
    the session in every call (see commit bde49a90)

    .. seealso::
        :meth:`generate`
    """
    engine = db_utils.get_db_engine(config)
    session = db_utils.get_db_session(engine)

    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']
    investigations = []

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {ahash: link.linkage_uuid
                     for ahash, link in hash_uuid_lut.items()
                     if link is not None}
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}"
                  .format(len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()
    patients_with_no_hashes = []

    for index, row in df_source.iterrows():
        patid = row['PATID']
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

        if len(pat_hashes) < 1:
            patients_with_no_hashes.append(patid)

        cls.log.debug("Parsing row for patient {} with {} hashes"
                      .format(patid, len(pat_hashes)))
        links, to_investigate = cls._process_patient_row(
            patid, pat_hashes.copy(), hash_uuid_lut, rules_cache, config,
            session, partner_code)

        cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

        if len(to_investigate) > 0:
            investigations.append(to_investigate)

        i = 0
        for ahash, link in links.items():
            i += 1
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    len_missing_both = len(patients_with_no_hashes)

    if len_missing_both > 0:
        cls.log.warning("Patients with no hashes: {} (out of {}). {}"
                        .format(len_missing_both, len(df),
                                patients_with_no_hashes))
    return df, investigations
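# Illustrative sketch (assumption, not the project's actual dispatch code):
# because `session` objects cannot be pickled, `_process_frame` receives only
# picklable arguments (config, a DataFrame chunk, the partner code) and builds
# its own engine/session.  That makes it possible to fan chunks out to worker
# processes, for example:
#
#     from functools import partial
#     from multiprocessing import Pool
#
#     # `LinkGenerator` stands in for the (unnamed here) class that defines
#     # `_process_frame`; `chunks` is a list of DataFrame fragments
#     worker = partial(LinkGenerator._process_frame, config,
#                      partner_code=partner)
#     with Pool(processes=4) as pool:
#         results = pool.map(worker, chunks)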
def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                         config, session, partner_code):
    """
    Process one row of the input frame.

    :return: tuple (links, to_investigate) where `links` maps each hash to
        the newly created linkage entity
    """
    # TODO: consider creating the engine and session inside this method
    # (instead of receiving `session` from the caller) to take advantage
    # of multiple cores
    cls.log.debug("Parsing row for patient {} with {} hashes".format(
        patid, len(pat_hashes)))
    links = {}
    to_investigate = {}

    if len(pat_hashes) == 0:
        cls.log.warning("Patient [{}] does not have any hashes"
                        .format(patid))
        # create a link anyway
        uuid = utils.get_uuid()
        flag = FLAG_HASH_NOT_FOUND
        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(RULE_CODE_NO_HASH),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=None,
            linkage_added_at=datetime.now())
        # TODO: add code to return this special link
    elif len(pat_hashes) == 1:
        # only one hash was received
        rule_code, ahash = pat_hashes.popitem()
        existing_link = hash_uuid_lut.get(ahash)
        binary_hash = unhexlify(ahash.encode('utf-8'))

        if existing_link is None:
            # create a new UUID
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND
        else:
            # reuse the existing UUID
            uuid = existing_link.linkage_uuid
            flag = FLAG_HASH_FOUND

        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code),  # we need the rule_id here
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=binary_hash,
            linkage_added_at=datetime.now())

        if rule_code == RULE_CODE_F_L_D_R:
            links = {ahash: new_link}
        else:
            links = {'': None, ahash: new_link}
    elif len(pat_hashes) == 2:
        links, to_investigate = cls._process_two_hashes(
            patid, pat_hashes, hash_uuid_lut, rules_cache, config,
            session, partner_code)

    return links, to_investigate
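# Illustrative note (assumption, not from the original module): the branches
# above correspond to how many hashes a patient row carries.  A row with a
# single hash for the RULE_CODE_F_L_D_R rule would take the middle branch:
#
#     pat_hashes = {RULE_CODE_F_L_D_R: '00ab12ef'}    # hypothetical hex hash
#     links, to_investigate = cls._process_patient_row(
#         patid, pat_hashes, hash_uuid_lut, rules_cache, config, session,
#         partner_code)
#     # links == {'00ab12ef': <LinkageEntity>}, to_investigate == {}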
def generate(cls, config, inputdir, outputdir, partner):
    """
    Read the "phi_hashes.csv" file and generate UUIDs.

    Optionally save the "hash -> UUID" mapping to "links.csv"
    """
    cls._validate_config(config)
    engine = db.get_db_engine(config)

    # Create the session once up front (this also ensures the tables exist).
    # Each chunk later builds its own session because session objects cannot
    # be pickled (see :meth:`_process_frame`).
    # TODO: add a method parameter for controlling the `create_tables` flag
    session = db.get_db_session(engine, create_tables=True)
    EXPECTED_COLS = config['EXPECTED_COLS']
    SAVE_OUT_FILE = config['SAVE_OUT_FILE']

    in_file = os.path.join(inputdir, config['IN_FILE'])

    try:
        reader = pd.read_csv(in_file,
                             sep=config['IN_DELIMITER'],
                             dtype=object,
                             skipinitialspace=True,
                             skip_blank_lines=True,
                             usecols=list(EXPECTED_COLS),
                             chunksize=config['LINES_PER_CHUNK'],
                             iterator=True)
        cls.log.info("Reading data from file: {} ({})".format(
            in_file, utils.get_file_size(in_file)))
    except ValueError as exc:
        cls.log.info("Please check if the actual column names"
                     " in [{}] match the expected column names: {}."
                     .format(in_file, sorted(EXPECTED_COLS)))
        cls.log.error("Error: {}".format(exc))
        return False

    frames = []
    investigations = []

    for df_source in reader:
        df_source.fillna('', inplace=True)
        # The magic happens here...
        df, to_investigate = cls._process_frame(config, df_source, partner)

        if SAVE_OUT_FILE:
            frames.append(df)
        investigations.extend(to_investigate)

    if SAVE_OUT_FILE:
        df = pd.concat(frames, ignore_index=True)
        out_file = os.path.join(outputdir, config['OUT_FILE'])
        out_file_investigate = os.path.join(outputdir,
                                            config['OUT_FILE_INVESTIGATE'])
        utils.frame_to_file(df, out_file, delimiter=config['OUT_DELIMITER'])
        cls.log.info("Wrote output file: {} ({} data rows, {})".format(
            out_file, len(df), utils.get_file_size(out_file)))

        with open(out_file_investigate, 'w') as invf:
            for line in investigations:
                invf.write("{}\n".format(line))
        cls.log.info("Wrote hashes that need investigation to {}".format(
            out_file_investigate))
    return True
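# Illustrative usage sketch (not part of the original module): one way the
# UUID generation could be driven from a small script.  `LinkGenerator`, the
# concrete directories, and the partner code are assumptions introduced only
# for this example.
#
#     # Config is built the same way as in the test setUp() shown above;
#     # the exact import path for Config is not shown in this excerpt.
#     config = Config(root_path='.', defaults={})
#     config.from_pyfile(SETTINGS_FILE)
#     LinkGenerator.generate(config,
#                            inputdir='/path/to/input',
#                            outputdir='/path/to/output',
#                            partner='partner_code_here')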