def create_links(self):
    """ Verify we can store LINKAGE rows """
    cache = RuleEntity.get_rules_cache(self.session)
    added_date = datetime.now()
    ahash = binascii.unhexlify(
        '2B2D67AED8D511E6A41AF45C898E9B67'.encode())
    rule_id = cache.get(RULE_CODE_F_L_D_R)
    uuid_list = [utils.get_uuid(), utils.get_uuid()]
    links = []

    for uuid in uuid_list:
        link = LinkageEntity.create(partner_code='UFH',
                                    rule_id=rule_id,
                                    linkage_patid='123',
                                    linkage_flag=0,
                                    linkage_uuid=uuid,
                                    linkage_hash=ahash,
                                    linkage_added_at=added_date)
        self.assertIsNotNone(link.id)
        links.append(link)
        print(link)

    # Search links matching a hash
    links_by_hash = self.session.query(LinkageEntity).filter_by(
        linkage_hash=ahash).all()
    self.assertIsNotNone(links_by_hash)
    self.assertEqual(2, len(links_by_hash))

    unique = LinkageEntity.get_unique_uuids(links, links_by_hash)
    self.assertEqual(2, len(unique))
    print(unique)
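
# Note: `LinkageEntity.get_unique_uuids` is exercised above but not shown.
# A minimal sketch consistent with the assertions (two links sharing a hash
# but carrying distinct UUIDs yield a set of size two) -- this is an
# assumption about the helper, not the project's actual implementation:
def get_unique_uuids(*link_lists):
    unique = set()
    for links in link_lists:
        for link in links:
            # collect the distinct UUID strings across all result lists
            unique.add(link.linkage_uuid)
    return unique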
def create_links(self):
    """ Verify we can store LINKAGE rows """
    pers_uuid = utils.get_uuid()  # generate a fresh UUID for the new row
    ahash = binascii.unhexlify(
        '2B2D67AED8D511E6A41AF45C898E9B67'.encode())
    added_date = datetime.now()
    cache = RuleEntity.get_rules_cache(self.session)
    rule_id = cache.get(RULE_CODE_F_L_D_R)

    link = LinkageEntity.create(partner_code='UFH',
                                rule_id=rule_id,
                                linkage_patid='123',
                                linkage_flag=0,
                                linkage_uuid=pers_uuid,
                                linkage_hash=ahash,
                                linkage_added_at=added_date)
    self.assertIsNotNone(link.id)
    print(link)

    # Search links matching a hash -- should return at most one row
    links_by_hash = self.session.query(LinkageEntity).filter_by(
        linkage_hash=ahash).all()
    self.assertIsNotNone(links_by_hash)
def create_rules(self):
    """ Create rule rows """
    added_date = datetime.now()

    # with self.assertRaises(NoResultFound):
    #     self.session.query(RuleEntity).filter_by(id=1).one()

    rule_r = self.session.query(RuleEntity).filter_by(
        rule_code=RULE_CODE_F_L_D_R).one_or_none()
    rule_s = self.session.query(RuleEntity).filter_by(
        rule_code=RULE_CODE_F_L_D_S).one_or_none()
    rule_x = self.session.query(RuleEntity).filter_by(
        rule_code=RULE_CODE_NO_HASH).one_or_none()

    if rule_r is None:
        RuleEntity.create(rule_code=RULE_CODE_F_L_D_R,
                          rule_description='First Last DOB Race',
                          rule_added_at=added_date)
    if rule_s is None:
        RuleEntity.create(rule_code=RULE_CODE_F_L_D_S,
                          rule_description='First Last DOB Sex',
                          rule_added_at=added_date)
    if rule_x is None:
        RuleEntity.create(rule_code=RULE_CODE_NO_HASH,
                          rule_description='No hash',
                          rule_added_at=added_date)

    # self.assertEqual(2, rule.id)
    cache = RuleEntity.get_rules_cache(self.session)
    self.assertIsNotNone(cache)
    print(cache)
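
# Both tests above call `RuleEntity.get_rules_cache` and use the result as
# a rule_code -> rule_id lookup. A minimal sketch of such a cache builder,
# assuming SQLAlchemy -- an illustration only, not the actual method:
def get_rules_cache(session):
    # map each rule_code to the rule's primary key for cheap FK lookups
    return {rule.rule_code: rule.id
            for rule in session.query(RuleEntity).all()}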
def _process_frame(cls, config, session, df_source, partner_code):
    """ Process a frame of the source file row by row. """
    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']
    investigations = []

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {ahash: link.linkage_uuid
                     for ahash, link in hash_uuid_lut.items()
                     if link is not None}
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}"
                  .format(len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()
    patients_with_no_hashes = []

    for index, row in df_source.iterrows():
        patid = row['PATID']
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

        if len(pat_hashes) < 1:
            patients_with_no_hashes.append(patid)

        cls.log.debug("Parsing row for patient {} with {} hashes"
                      .format(patid, len(pat_hashes)))
        links, to_investigate = cls._process_patient_row(
            patid, pat_hashes.copy(), hash_uuid_lut, rules_cache,
            config, session, partner_code)
        cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

        if len(to_investigate) > 0:
            investigations.append(to_investigate)

        i = 0
        for ahash, link in links.items():
            i += 1
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    cls.log.warning("{} out of {} patients are missing both hashes: {}"
                    .format(len(patients_with_no_hashes), len(df),
                            patients_with_no_hashes))
    return df, investigations
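
# `_process_frame` relies on `_populate_hash_uuid_lut` mapping every hash
# that appears in the frame to its LinkageEntity, or to None when the hash
# was never linked. A hypothetical sketch under that assumption (a naive
# query-per-hash version for illustration; the real helper likely batches):
def _populate_hash_uuid_lut(config, session, df_source):
    rules = config['ENABLED_RULES'].values()
    # collect every non-empty hash present in the source frame
    hashes = {row[rule]
              for _, row in df_source.iterrows()
              for rule in rules if row[rule] != ''}
    return {ahash: session.query(LinkageEntity).filter_by(
                linkage_hash=binascii.unhexlify(ahash.encode())).first()
            for ahash in hashes}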
def _process_frame(cls, config, session, df_source, partner_code):
    """ Process a fragment of the large file using a multiprocessing pool. """
    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {
        ahash: link.linkage_uuid
        for ahash, link in hash_uuid_lut.items()
        if link is not None
    }
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}".format(
        len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()

    # Create list of patients w/o hashes
    patients_with_no_hashes = [
        row['PATID'] for index, row in df_source.iterrows()
        if len({rule: row[rule] for rule in rules if row[rule] != ''}) < 1
    ]

    # Fan the per-patient work out to a multiprocessing pool
    pool = mp.Pool(processes=NUM_CPUS)
    df, investigations = cls._process_patient_row_main(
        cls, pool, df, df_source, rules, hash_uuid_lut, rules_cache,
        config, partner_code)

    cls.log.warning(
        "{} out of {} patients are missing both hashes: {}".format(
            len(patients_with_no_hashes), len(df), patients_with_no_hashes))
    return df, investigations
def _process_frame(cls, config, df_source, partner_code):
    """ Process a fragment of the large file.

    Note: since the `session` object can't be pickled we create the
    session in every call (see commit bde49a90)

    .. seealso:: :meth:`generate`
    """
    engine = db_utils.get_db_engine(config)
    session = db_utils.get_db_session(engine)

    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']
    investigations = []

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {ahash: link.linkage_uuid
                     for ahash, link in hash_uuid_lut.items()
                     if link is not None}
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}"
                  .format(len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()
    patients_with_no_hashes = []

    for index, row in df_source.iterrows():
        patid = row['PATID']
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

        if len(pat_hashes) < 1:
            patients_with_no_hashes.append(patid)

        cls.log.debug("Parsing row for patient {} with {} hashes"
                      .format(patid, len(pat_hashes)))
        links, to_investigate = cls._process_patient_row(
            patid, pat_hashes.copy(), hash_uuid_lut, rules_cache,
            config, session, partner_code)
        cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

        if len(to_investigate) > 0:
            investigations.append(to_investigate)

        i = 0
        for ahash, link in links.items():
            i += 1
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    len_missing_both = len(patients_with_no_hashes)
    if len_missing_both > 0:
        cls.log.warning("Patients with no hashes: {} (out of {}). {}"
                        .format(len_missing_both, len(df),
                                patients_with_no_hashes))
    return df, investigations
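
# The docstring above explains why the session is created inside every
# call: a live DB session holds socket state and cannot be pickled across
# process boundaries. A minimal sketch of the two helpers, assuming
# SQLAlchemy and a hypothetical 'DB_URL' config key:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

def get_db_engine(config):
    # build one engine per worker process
    return create_engine(config['DB_URL'])

def get_db_session(engine):
    # bind a fresh session to the worker-local engine
    return sessionmaker(bind=engine)()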
def _process_frame(cls, config, session, df_source, partner_code):
    """ Process a fragment of the large file by fanning the per-patient
    work out to a multiprocessing pool. """
    # Init an empty frame and copy the patid from the source
    df = pd.DataFrame()
    df['PATID'] = df_source['PATID']
    investigations = []

    hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
    mapped_hashes = {ahash: link.linkage_uuid
                     for ahash, link in hash_uuid_lut.items()
                     if link is not None}
    rules_cache = RuleEntity.get_rules_cache(session)
    cls.log.debug("Found {} linked hashes in db: {}"
                  .format(len(mapped_hashes), mapped_hashes))

    # the rules are ordered by their importance
    rules = config['ENABLED_RULES'].values()
    patients_with_no_hashes = []

    jobs = []
    frames = []
    pool = mp.Pool(processes=NUM_CPUS)

    for index, row in df_source.iterrows():
        patid = row['PATID']
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

        if len(pat_hashes) < 1:
            patients_with_no_hashes.append(patid)

        cls.log.debug("Parsing row for patient {} with {} hashes"
                      .format(patid, len(pat_hashes)))

        # Extract results from a pool.apply_async call using get()
        # See:
        # https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.AsyncResult
        job = utils.apply_async(pool, cls._process_patient_row,
                                (patid, pat_hashes.copy(), hash_uuid_lut,
                                 rules_cache, config, partner_code))
        jobs.append(job)

    job_count = len(jobs)
    cls.log.info("Total multiproc jobs: {}".format(job_count))

    # collect the results
    for index, job in enumerate(jobs):
        try:
            frames.append(job.get())
            if index % 10 == 0:
                cls.log.info("Got results for frame {} (out of {})"
                             .format(index, job_count))
        except Exception as exc:
            cls.log.error("Job [{}] error: {}".format(index, exc))
            mp.get_logger().error(traceback.format_exc())

    pool.close()
    pool.join()
    cls.log.info("Got all {} frames. Concatenating...".format(job_count))

    # Kludgy: iterate over df_source.iterrows() again to recover the PATID
    # that corresponds to each result frame
    for frame, row in zip(frames, df_source.iterrows()):
        patid = row[1]['PATID']
        links = frame[0]
        to_investigate = frame[1]

        # NOTE: printing to console/writing to log can slow processes...
        # Do we need to know the number of links created for each patid?
        # Is it possible to write this to the df and then to a SQL table
        # or csv file instead?
        cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

        if len(to_investigate) > 0:
            investigations.append(to_investigate)

        i = 0
        for ahash, link in links.items():
            i += 1
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    cls.log.warning("{} out of {} patients are missing both hashes: {}"
                    .format(len(patients_with_no_hashes), len(df),
                            patients_with_no_hashes))
    return df, investigations
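
# `utils.apply_async` is used above instead of calling `pool.apply_async`
# directly. A common reason is that pool workers propagate only the bare
# exception object, losing the worker-side traceback. A hypothetical sketch
# of such a wrapper -- an assumption about `utils`, not its actual code:
import multiprocessing as mp
import traceback

class _LogExceptions:
    """Wrap a callable so worker tracebacks are logged, not swallowed."""
    def __init__(self, func):
        self.func = func

    def __call__(self, *args, **kwargs):
        try:
            return self.func(*args, **kwargs)
        except Exception:
            # log the full traceback from inside the worker before
            # re-raising, so job.get() failures remain diagnosable
            mp.get_logger().error(traceback.format_exc())
            raise

def apply_async(pool, func, args):
    return pool.apply_async(_LogExceptions(func), args)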