def create_links(self):
    """Verify we can store two LINKAGE rows that share the same hash."""
    rules = RuleEntity.get_rules_cache(self.session)
    when = datetime.now()
    shared_hash = binascii.unhexlify(
        '2B2D67AED8D511E6A41AF45C898E9B67'.encode())
    rule_id = rules.get(RULE_CODE_F_L_D_R)

    created = []
    for fresh_uuid in (utils.get_uuid(), utils.get_uuid()):
        row = LinkageEntity.create(partner_code='UFH',
                                   rule_id=rule_id,
                                   linkage_patid='123',
                                   linkage_flag=0,
                                   linkage_uuid=fresh_uuid,
                                   linkage_hash=shared_hash,
                                   linkage_added_at=when)
        self.assertIsNotNone(row.id)
        created.append(row)
        print(row)

    # Search links matching a hash -- both inserted rows should come back
    found = self.session.query(LinkageEntity).filter_by(
        linkage_hash=shared_hash).all()
    self.assertIsNotNone(found)
    self.assertTrue(len(found) == 2)

    # The two rows were given distinct UUIDs, so two unique values remain
    unique = LinkageEntity.get_unique_uuids(created, found)
    self.assertTrue(len(unique) == 2)
    print(unique)
def create_links(self):
    """Verify we can store a single LINKAGE row and find it by hash."""
    person_uuid = utils.get_uuid()
    # For the real code we can just copy the hash bytes as received
    sample_hash = binascii.unhexlify(
        '2B2D67AED8D511E6A41AF45C898E9B67'.encode())
    when = datetime.now()
    rules = RuleEntity.get_rules_cache(self.session)
    rule_id = rules.get(RULE_CODE_F_L_D_R)

    link = LinkageEntity.create(partner_code='UFH',
                                rule_id=rule_id,
                                linkage_patid='123',
                                linkage_flag=0,
                                linkage_uuid=person_uuid,
                                linkage_hash=sample_hash,
                                linkage_added_at=when)
    self.assertIsNotNone(link.id)
    print(link)

    # Search links matching a hash -- should return at most one row
    matches = self.session.query(LinkageEntity).filter_by(
        linkage_hash=sample_hash).all()
    self.assertIsNotNone(matches)
def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                         config, session, partner_code):
    """
    Create linkage rows for one patient depending on how many hashes
    (0, 1, or 2) were received for it.

    TODO: This function is not handling the case when we run the linkage
    for the same patient twice.

    :param patid: the partner's patient identifier
    :param pat_hashes: dict of rule_code -> hex hash string for this patid
    :param hash_uuid_lut: dict mapping hex hash -> existing LinkageEntity
        (``get`` returns None when the hash is not in the database)
    :param rules_cache: dict mapping rule codes to rule ids
    :return a tuple of OrderedDicts (linkage_entities, sha_to_investigate)
    """
    links = {}
    to_investigate = {}

    if len(pat_hashes) == 0:
        # create a link anyway
        uuid = utils.get_uuid()
        flag = FLAG_HASH_NOT_FOUND
        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(RULE_CODE_NO_HASH),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=None,
            linkage_added_at=datetime.now())
        links = {'': new_link}
    elif len(pat_hashes) == 1:
        # only one hash was received
        rule_code, ahash = pat_hashes.popitem()
        """
        TODO: there are multiple cases when the same hash is associated
        with 2 or 3 different patients from the same partner -- which
        means that storing and checking only the first link object in
        the LUT can result in linking of ambiguous hashes.
        Helper query:
            select linkage_hash, count(*) cc from linkage
            where linkage_flag = 2 -- FLAG_SKIP_MATCH
                and partner_code = 'xyz'
            group by linkage_hash having count(*) > 1
        """
        existing_link = hash_uuid_lut.get(ahash)
        binary_hash = unhexlify(ahash.encode('utf-8'))

        if existing_link is None:
            # create new UUID
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND
        else:
            # If we find a link with the same hash from the same source
            # we ignore it and mark it accordingly
            if existing_link.needs_to_skip_match_for_partner(partner_code):
                uuid = utils.get_uuid()
                flag = FLAG_SKIP_MATCH
            else:
                # reuse the existing UUID
                uuid = existing_link.linkage_uuid
                flag = FLAG_HASH_FOUND

        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code),  # we need the rule_id here
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=binary_hash,
            linkage_added_at=datetime.now())

        if rule_code == RULE_CODE_F_L_D_R:
            links = {ahash: new_link}
        else:
            links = {'': None, ahash: new_link}
    elif len(pat_hashes) == 2:
        links, to_investigate = cls._process_two_hashes(
            patid, pat_hashes, hash_uuid_lut, rules_cache, config,
            session, partner_code)

    return links, to_investigate
def _process_two_hashes(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                        config, session, partner_code):
    """
    Create linkage rows for a patient that sent exactly two hashes.

    Note: a `1` in the comments below indicates that a hash is already
    in the database!

    We have to cover 2^2 + 1 = 5 cases:
        1. h1 => 0, h2 => 0 - create new UUID and use for both rows
        2. h1 => 0, h2 => 1 \
                             _ reuse a UUID
        3. h1 => 1, h2 => 0 /
        4. h1 => 1, h2 => 1 and UUIDs match - reuse a UUID and create
            two rows
        5. h1 => 1, h2 => 1 and the corresponding UUIDs do NOT match -
            reuse the UUID but link only the first hash
    """
    links = {}
    to_investigate = {}
    added_date = datetime.now()

    # TODO: This is ugly but we can make (if needed)
    # the logic work for "n" rules
    rule_code_1, ahash_1 = pat_hashes.popitem()
    rule_code_2, ahash_2 = pat_hashes.popitem()
    # Each lookup yields an existing LinkageEntity or None
    existing_link_1 = hash_uuid_lut.get(ahash_1)
    existing_link_2 = hash_uuid_lut.get(ahash_2)

    both_not_found = existing_link_1 is None and existing_link_2 is None
    only_one_found = ((existing_link_1 is None and
                       existing_link_2 is not None) or
                      (existing_link_1 is not None and
                       existing_link_2 is None))

    if both_not_found:
        # case 1: create two links with a `fresh` UUID
        uuid = utils.get_uuid()
        flag_1 = FLAG_HASH_NOT_FOUND
        flag_2 = FLAG_HASH_NOT_FOUND
        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag_1,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)
        new_link_2 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_2),
            linkage_patid=patid,
            linkage_flag=flag_2,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_2.encode('utf-8')),
            linkage_added_at=added_date)
    elif only_one_found:
        # cases 2 and 3: reuse the existing UUID
        if existing_link_1 is not None:
            flag_2 = FLAG_HASH_NOT_FOUND
            # TODO: verify the logic:
            # "two distinct patids with same hash from same partner are
            # considered different persons"
            if existing_link_1.needs_to_skip_match_for_partner(partner_code):  # noqa
                uuid = utils.get_uuid()
                flag_1 = FLAG_SKIP_MATCH
            else:
                uuid = existing_link_1.linkage_uuid
                flag_1 = FLAG_HASH_FOUND
        else:
            flag_1 = FLAG_HASH_NOT_FOUND
            if existing_link_2.needs_to_skip_match_for_partner(partner_code):  # noqa
                uuid = utils.get_uuid()
                flag_2 = FLAG_SKIP_MATCH
            else:
                uuid = existing_link_2.linkage_uuid
                flag_2 = FLAG_HASH_FOUND

        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag_1,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)
        new_link_2 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_2),
            linkage_patid=patid,
            linkage_flag=flag_2,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_2.encode('utf-8')),
            linkage_added_at=added_date)
    else:
        # cases 4 and 5: both are found
        if existing_link_1.needs_to_skip_match_for_partner(partner_code):
            uuid = utils.get_uuid()
            flag = FLAG_SKIP_MATCH
        else:
            uuid = existing_link_1.linkage_uuid
            flag = FLAG_HASH_FOUND

        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)

        # UUID's match - insert row for the second hash too
        if (existing_link_1.linkage_uuid == existing_link_2.linkage_uuid):
            # consensus hence insert row for hash 2 too
            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)
        else:
            # the UUID's do not match - we need to investigate
            to_investigate = {
                ahash_2: [existing_link_1.linkage_uuid,
                          existing_link_2.linkage_uuid]
            }
            cls.log.warning("Hashes of the patid [{}] are linked"
                            " to two distinct UUIDs: {}, {}."
                            " We linked only the first hash!"
                            .format(patid,
                                    existing_link_1.linkage_uuid,
                                    existing_link_2.linkage_uuid))
            return {ahash_1: new_link_1}, to_investigate

    links[ahash_1] = new_link_1
    links[ahash_2] = new_link_2
    return links, {}
def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                         config, partner_code):
    """
    Create linkage rows for one patient depending on how many hashes
    (0, 1, or 2) were received for it.

    :param patid: the partner's patient identifier
    :param pat_hashes: dict of rule_code -> hex hash string for this patid
    :param hash_uuid_lut: dict mapping hex hash -> existing LinkageEntity
        (``get`` returns None when the hash is not in the database)
    :param rules_cache: dict mapping rule codes to rule ids
    :param config: app configuration used to open the database session
    :return OrderedDict: with the newly created linkage entities
    """
    # TODO: need to create engine and session within for...loop
    # to take advantage of multi-core
    session = db.get_db_session(db.get_db_engine(config),
                                create_tables=True)
    cls.log.debug("Parsing row for patient {} with {} hashes".format(
        patid, len(pat_hashes)))
    links = {}
    to_investigate = {}

    if len(pat_hashes) == 0:
        # `warning` instead of the deprecated `warn` alias
        cls.log.warning(
            "Patient [{}] does not have any hashes".format(patid))
        # create a link anyway
        uuid = utils.get_uuid()
        flag = FLAG_HASH_NOT_FOUND
        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(RULE_CODE_NO_HASH),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=None,
            linkage_added_at=datetime.now())
        # Return the special "no hash" link under the empty-string key
        # (it was previously created but dropped, so callers never saw it)
        links = {'': new_link}
    elif len(pat_hashes) == 1:
        # only one hash was received
        rule_code, ahash = pat_hashes.popitem()
        existing_link = hash_uuid_lut.get(ahash)
        binary_hash = unhexlify(ahash.encode('utf-8'))

        if existing_link is None:
            # create new UUID
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND
        else:
            # reuse the existing UUID
            uuid = existing_link.linkage_uuid
            flag = FLAG_HASH_FOUND

        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code),  # we need the rule_id here
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=binary_hash,
            linkage_added_at=datetime.now())

        if rule_code == RULE_CODE_F_L_D_R:
            links = {ahash: new_link}
        else:
            links = {'': None, ahash: new_link}
    elif len(pat_hashes) == 2:
        links, to_investigate = cls._process_two_hashes(
            patid, pat_hashes, hash_uuid_lut, rules_cache, config,
            session, partner_code)

    return links, to_investigate
def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                         config, session, partner_code, skip_db_lookup):
    """
    Create linkage rows for one patient depending on how many hashes
    (0, 1, or 2) were received for it.

    TODO: This function is not handling the case when we run the linkage
    for the same patient twice.

    :param patid: the partner's patient identifier
    :param pat_hashes: dict of rule_code -> hex hash string for this patid
    :param hash_uuid_lut: dict mapping hex hash -> LIST of existing
        LinkageEntity rows for that hash
    :param rules_cache: dict mapping rule codes to rule ids
    :param skip_db_lookup: True when two or more rows with the same hash
        were detected, so the LUT result must be ignored
    :return a tuple of OrderedDicts (linkage_entities, sha_to_investigate)
    """
    links = {}
    to_investigate = {}

    if len(pat_hashes) == 0:
        # create a link anyway
        uuid = utils.get_uuid()
        flag = FLAG_HASH_NOT_FOUND
        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(RULE_CODE_NO_HASH),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=None,
            linkage_added_at=datetime.now())
        links = {'': new_link}
    elif len(pat_hashes) == 1:
        # only one hash was received
        rule_code, ahash = pat_hashes.popitem()
        # Note: there are multiple cases when the same hash is associated
        # with 2 or 3 different patients from the same partner -- which
        # means that we have to check every link object in the LUT to
        # avoid linking different patids even if they have the same hash
        # value. Example query:
        #   SELECT linkage_hash, count(*) cc FROM linkage
        #   WHERE linkage_flag = 2 -- FLAG_SKIP_MATCH
        #       and partner_code = 'xyz'
        #   GROUP BY linkage_hash HAVING COUNT(*) > 1

        # list type; default to [] so a hash missing from the LUT does
        # not make len() blow up with a TypeError on None
        existing_links = hash_uuid_lut.get(ahash, [])
        binary_hash = unhexlify(ahash.encode('utf-8'))

        if skip_db_lookup:
            # we detected two or more rows with same hash
            uuid = utils.get_uuid()
            flag = FLAG_SKIP_REPEATED
        elif len(existing_links) == 0:
            # the hash search did not find any records => create new UUID
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND
        else:
            # If we find a link with the same hash from the same source
            # we ignore it and mark it accordingly
            if LinkageEntity.needs_to_skip_match_for_partner(
                    existing_links, partner_code, patid):
                uuid = utils.get_uuid()
                flag = FLAG_SKIP_MATCH
            else:
                # reuse the first existing UUID
                # TODO: check if the source of the link matters here?
                uuid = existing_links[0].linkage_uuid
                flag = FLAG_HASH_FOUND

        new_link = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code),  # we need the rule_id here
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=binary_hash,
            linkage_added_at=datetime.now())

        if rule_code == RULE_CODE_F_L_D_R:
            links = {ahash: new_link}
        else:
            links = {'': None, ahash: new_link}
    elif len(pat_hashes) == 2:
        links, to_investigate = cls._process_two_hashes(
            patid, pat_hashes, hash_uuid_lut, rules_cache, config,
            session, partner_code, skip_db_lookup)

    return links, to_investigate
def _process_two_hashes(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                        config, session, partner_code, skip_db_lookup):
    """
    Create linkage rows for a patient that sent exactly two hashes.

    :param patid: string representing the patient processed
    :param hash_uuid_lut: a dictionary of LISTS of links found in the
        database for every hash associated with the patid we are
        processing
    :param skip_db_lookup: True when repeated rows with the same hash
        were detected; forces fresh UUIDs with FLAG_SKIP_REPEATED

    Note: a `1` in the comments below indicates that a hash is already
    in the database!

    We have to cover 2^2 + 1 = 5 cases:
        1. h1 => 0, h2 => 0 - create new UUID and use for both rows
        2. h1 => 0, h2 => 1 \
                             _ reuse a UUID
        3. h1 => 1, h2 => 0 /
        4. h1 => 1, h2 => 1 and UUIDs match - reuse a UUID and create
            two rows
        5. h1 => 1, h2 => 1 and the corresponding UUIDs do NOT match -
            reuse the UUID but link only the first hash
    """
    links = {}
    to_investigate = {}
    added_date = datetime.now()

    # TODO: This is ugly but we can make (if needed)
    # the logic work for "n" rules
    rule_code_1, ahash_1 = pat_hashes.popitem()
    rule_code_2, ahash_2 = pat_hashes.popitem()

    # The dictionary contains lists of links; default to [] so a hash
    # missing from the LUT does not crash len() with a TypeError on None
    existing_links_1 = hash_uuid_lut.get(ahash_1, [])
    existing_links_2 = hash_uuid_lut.get(ahash_2, [])

    both_not_found = (len(existing_links_1) == 0 and
                      len(existing_links_2) == 0)
    only_one_found = (
        (len(existing_links_1) == 0 and len(existing_links_2) > 0) or
        (len(existing_links_1) > 0 and len(existing_links_2) == 0))

    if both_not_found or skip_db_lookup:
        # case 1: create two links with a `fresh` UUID
        uuid = utils.get_uuid()
        flag = FLAG_SKIP_REPEATED if skip_db_lookup else FLAG_HASH_NOT_FOUND  # noqa
        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)
        new_link_2 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_2),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_2.encode('utf-8')),
            linkage_added_at=added_date)
    elif only_one_found:
        # cases 2 and 3: reuse the existing UUID
        # (skip_db_lookup is always False here -- it is consumed by the
        # first branch above -- but the checks are kept for safety)
        if len(existing_links_1) > 0:
            flag_2 = FLAG_HASH_NOT_FOUND
            # Two distinct patids with same hash from same partner are
            # considered different persons and get distinct UUIDs
            if skip_db_lookup:
                uuid = utils.get_uuid()
                flag_1 = FLAG_SKIP_REPEATED
            elif LinkageEntity.needs_to_skip_match_for_partner(
                    existing_links_1, partner_code, patid):
                uuid = utils.get_uuid()
                flag_1 = FLAG_SKIP_MATCH
            else:
                uuid = existing_links_1[0].linkage_uuid
                flag_1 = FLAG_HASH_FOUND
        else:
            flag_1 = FLAG_HASH_NOT_FOUND
            if skip_db_lookup:
                uuid = utils.get_uuid()
                flag_2 = FLAG_SKIP_REPEATED
            # `elif` (was a plain `if`) so the skip_db_lookup result can
            # never be silently overwritten by the match check below
            elif LinkageEntity.needs_to_skip_match_for_partner(
                    existing_links_2, partner_code, patid):
                uuid = utils.get_uuid()
                flag_2 = FLAG_SKIP_MATCH
            else:
                uuid = existing_links_2[0].linkage_uuid
                flag_2 = FLAG_HASH_FOUND

        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag_1,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)
        new_link_2 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_2),
            linkage_patid=patid,
            linkage_flag=flag_2,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_2.encode('utf-8')),
            linkage_added_at=added_date)
    else:
        # cases 4 and 5: both are found
        if LinkageEntity.needs_to_skip_match_for_partner(
                existing_links_1, partner_code, patid):
            uuid = utils.get_uuid()
            flag = FLAG_SKIP_MATCH
        else:
            uuid = existing_links_1[0].linkage_uuid
            flag = FLAG_HASH_FOUND

        new_link_1 = LinkageEntity.create(
            partner_code=partner_code,
            rule_id=rules_cache.get(rule_code_1),
            linkage_patid=patid,
            linkage_flag=flag,
            linkage_uuid=uuid,
            linkage_hash=unhexlify(ahash_1.encode('utf-8')),
            linkage_added_at=added_date)

        distinct_uuids = LinkageEntity.get_unique_uuids(
            existing_links_1, existing_links_2)

        if 1 == len(distinct_uuids):
            # UUID's match - insert row for the second hash too
            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)
        else:
            # the UUID's do not match - we need to investigate
            to_investigate = {
                ahash_2:
                    [[lnk.linkage_uuid for lnk in existing_links_1] +
                     [lnk.linkage_uuid for lnk in existing_links_2]]
            }
            cls.log.warning("Hashes of the patid [{}] are linked"
                            " to two distinct UUIDs: {}."
                            " We linked only the first hash!".format(
                                patid, to_investigate))
            return {ahash_1: new_link_1}, to_investigate

    links[ahash_1] = new_link_1
    links[ahash_2] = new_link_2
    return links, {}