Code example #1
    def create_links(self):
        """ Verify we can store LINKAGE rows """
        cache = RuleEntity.get_rules_cache(self.session)
        added_date = datetime.now()
        ahash = binascii.unhexlify('2B2D67AED8D511E6A41AF45C898E9B67'.encode())
        rule_id = cache.get(RULE_CODE_F_L_D_R)
        uuid_list = [utils.get_uuid(), utils.get_uuid()]

        links = []

        for uuid in uuid_list:
            link = LinkageEntity.create(partner_code='UFH',
                                        rule_id=rule_id,
                                        linkage_patid='123',
                                        linkage_flag=0,
                                        linkage_uuid=uuid,
                                        linkage_hash=ahash,
                                        linkage_added_at=added_date)
            self.assertIsNotNone(link.id)
            links.append(link)
            print(link)

        # Search links matching a hash
        links_by_hash = self.session.query(LinkageEntity).filter_by(
            linkage_hash=ahash).all()
        self.assertIsNotNone(links_by_hash)
        self.assertEqual(len(links_by_hash), 2)

        unique = LinkageEntity.get_unique_uuids(links, links_by_hash)
        self.assertEqual(len(unique), 2)
        print(unique)
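
The assertions above lean on LinkageEntity.get_unique_uuids, whose body is not
shown in this listing. A minimal sketch of what such a helper could look like,
assuming it simply collects the distinct linkage_uuid values from two lists of
links (inferred from the assertions, not copied from the project):

    def get_unique_uuids(links_1, links_2):
        """Return the set of distinct linkage_uuid values from two lists."""
        # A set deduplicates UUIDs shared between the two result lists.
        return {link.linkage_uuid for link in list(links_1) + list(links_2)}
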
Code example #2
File: test_models.py  Project: yup111/onefl-deduper
    def create_links(self):
        """ Verify we can store LINKAGE rows """
        pers_uuid = utils.get_uuid()

        # For the real code we can just copy the
        ahash = binascii.unhexlify('2B2D67AED8D511E6A41AF45C898E9B67'.encode())
        added_date = datetime.now()
        cache = RuleEntity.get_rules_cache(self.session)
        rule_id = cache.get(RULE_CODE_F_L_D_R)

        link = LinkageEntity.create(partner_code='UFH',
                                    rule_id=rule_id,
                                    linkage_patid='123',
                                    linkage_flag=0,
                                    linkage_uuid=pers_uuid,
                                    linkage_hash=ahash,
                                    linkage_added_at=added_date)
        self.assertIsNotNone(link.id)

        print(link)

        # Search links matching a hash -- should return at most one row
        links_by_hash = self.session.query(LinkageEntity).filter_by(
            linkage_hash=ahash).all()
        self.assertIsNotNone(links_by_hash)
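
For reference, binascii.unhexlify turns the 32-character hex string into the
16 raw bytes stored in the linkage_hash column, and hexlify reverses it. The
round-trip below is plain standard-library behavior:

    import binascii

    hex_string = '2B2D67AED8D511E6A41AF45C898E9B67'
    raw = binascii.unhexlify(hex_string.encode())  # 16 raw bytes
    assert len(raw) == 16
    assert binascii.hexlify(raw).decode().upper() == hex_string
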
Code example #3
    def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut,
                             rules_cache, config, session,
                             partner_code):
        """
        TODO: This function does not yet handle the case where we run the
        linkage for the same patient twice.

        :return a tuple of OrderedDicts (linkage_entities, sha_to_investigate)
        """
        links = {}
        to_investigate = {}

        if len(pat_hashes) == 0:
            # create a link anyway
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(RULE_CODE_NO_HASH),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=None,
                linkage_added_at=datetime.now())
            links = {'': new_link}

        elif len(pat_hashes) == 1:
            # only one hash was received
            rule_code, ahash = pat_hashes.popitem()
            """
            TODO: there are multiple cases when the same hash is associated
            with 2 or 3 different patients from the same partner --
            which means that storing and checking only the first link object
            in the LUT can result in linking of ambiguous hashes.

            Helper query:
                select
                    linkage_hash, count(*) cc
                from
                    linkage
                where
                    linkage_flag = 2  -- FLAG_SKIP_MATCH
                    and partner_code = 'xyz'
                group by linkage_hash
                having
                    count(*) > 1
            """
            existing_link = hash_uuid_lut.get(ahash)
            binary_hash = unhexlify(ahash.encode('utf-8'))

            if existing_link is None:
                # create new UUID
                uuid = utils.get_uuid()
                flag = FLAG_HASH_NOT_FOUND
            else:
                # If we find a link with the same hash from the same source
                # we ignore it and mark it accordingly
                if existing_link.needs_to_skip_match_for_partner(partner_code):
                    uuid = utils.get_uuid()
                    flag = FLAG_SKIP_MATCH
                else:
                    # reuse the existing UUID
                    uuid = existing_link.linkage_uuid
                    flag = FLAG_HASH_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code),  # we need the rule_id here
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=binary_hash,
                linkage_added_at=datetime.now())

            if rule_code == RULE_CODE_F_L_D_R:
                links = {ahash: new_link}
            else:
                links = {'': None, ahash: new_link}

        elif len(pat_hashes) == 2:
            links, to_investigate = cls._process_two_hashes(
                patid, pat_hashes, hash_uuid_lut,
                rules_cache, config, session,
                partner_code)
        return links, to_investigate
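
The helper query from the docstring can also be written with SQLAlchemy. A
hedged sketch, assuming LinkageEntity maps the linkage_hash, linkage_flag, and
partner_code columns used throughout these examples (illustrative, not project
code):

    from sqlalchemy import func

    # Hashes flagged FLAG_SKIP_MATCH (= 2) more than once for one partner --
    # candidates for manual review.
    duplicated_hashes = (
        session.query(LinkageEntity.linkage_hash, func.count().label('cc'))
        .filter(LinkageEntity.linkage_flag == 2,
                LinkageEntity.partner_code == 'xyz')
        .group_by(LinkageEntity.linkage_hash)
        .having(func.count() > 1)
        .all()
    )
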
Code example #4
    def _process_two_hashes(cls, patid, pat_hashes, hash_uuid_lut,
                            rules_cache, config, session,
                            partner_code):
        """
        Note: a `1` in the comments below indicates that a hash is already in
            the database!

        We have to cover 2^2 + 1 = 5 cases:
            1. h1 => 0, h2 => 0     - create new UUID and use for both rows
            2. h1 => 0, h2 => 1  \  _ reuse a UUID
            3. h1 => 1, h2 => 0  /
            4. h1 => 1, h2 => 1 and UUIDs match
                - reuse a UUID and create two rows
            5. h1 => 1, h2 => 1 and the corresponding UUIDs do NOT match
                - reuse the UUID but link only the first hash
        """
        links = {}
        to_investigate = {}
        added_date = datetime.now()

        # TODO: This is ugly but we can make (if needed)
        # the logic work for "n" rules
        rule_code_1, ahash_1 = pat_hashes.popitem()
        rule_code_2, ahash_2 = pat_hashes.popitem()
        existing_link_1 = hash_uuid_lut.get(ahash_1)
        existing_link_2 = hash_uuid_lut.get(ahash_2)

        both_not_found = existing_link_1 is None and existing_link_2 is None
        only_one_found = ((existing_link_1 is None and
                           existing_link_2 is not None) or
                          (existing_link_1 is not None and
                           existing_link_2 is None))

        if both_not_found:
            # create two links with a `fresh` UUID
            uuid = utils.get_uuid()
            flag_1 = FLAG_HASH_NOT_FOUND
            flag_2 = FLAG_HASH_NOT_FOUND

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag_1,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag_2,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)

        elif only_one_found:
            # reuse the existing UUID
            if existing_link_1 is not None:
                flag_2 = FLAG_HASH_NOT_FOUND

                # TODO: verify the logic:
                #   "two distinct patids with same hash from same partner are
                #   considered different persons"
                if existing_link_1.needs_to_skip_match_for_partner(partner_code):  # noqa
                    uuid = utils.get_uuid()
                    flag_1 = FLAG_SKIP_MATCH
                else:
                    uuid = existing_link_1.linkage_uuid
                    flag_1 = FLAG_HASH_FOUND
            else:
                flag_1 = FLAG_HASH_NOT_FOUND

                if existing_link_2.needs_to_skip_match_for_partner(partner_code):  # noqa
                    uuid = utils.get_uuid()
                    flag_2 = FLAG_SKIP_MATCH
                else:
                    uuid = existing_link_2.linkage_uuid
                    flag_2 = FLAG_HASH_FOUND

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag_1,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag_2,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)
        else:
            # both are found
            if existing_link_1.needs_to_skip_match_for_partner(partner_code):
                uuid = utils.get_uuid()
                flag = FLAG_SKIP_MATCH
            else:
                uuid = existing_link_1.linkage_uuid
                flag = FLAG_HASH_FOUND

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            # UUIDs match - insert a row for the second hash too
            if (existing_link_1.linkage_uuid ==
                    existing_link_2.linkage_uuid):
                # consensus hence insert row for hash 2 too
                new_link_2 = LinkageEntity.create(
                    partner_code=partner_code,
                    rule_id=rules_cache.get(rule_code_2),
                    linkage_patid=patid,
                    linkage_flag=flag,
                    linkage_uuid=uuid,
                    linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                    linkage_added_at=added_date)
            else:
                # the UUIDs do not match - we need to investigate
                to_investigate = {
                    ahash_2: [existing_link_1.linkage_uuid,
                              existing_link_2.linkage_uuid]
                }
                cls.log.warning("Hashes of the patid [{}] are linked"
                                " to two distinct UUIDs: {}, {}."
                                " We linked only the first hash!"
                                .format(patid,
                                        existing_link_1.linkage_uuid,
                                        existing_link_2.linkage_uuid))
                return {ahash_1: new_link_1}, to_investigate

        links[ahash_1] = new_link_1
        links[ahash_2] = new_link_2

        return links, {}
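
The branches above implement the five cases enumerated in the docstring. As a
compact sanity check, the case analysis can be expressed as a small pure
function (a sketch for illustration only, not project code):

    def classify_case(found_1, found_2, uuids_match=None):
        """Map (h1 found?, h2 found?, UUIDs equal?) to the case number."""
        if not found_1 and not found_2:
            return 1                    # fresh UUID for both rows
        if found_1 != found_2:
            return 3 if found_1 else 2  # reuse the UUID of the found hash
        return 4 if uuids_match else 5  # both found: consensus or investigate

    assert classify_case(False, False) == 1
    assert classify_case(True, True, uuids_match=False) == 5
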
Code example #5
    def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut,
                             rules_cache, config, partner_code):
        """
        :return OrderedDict: with the newly created linkage entities
        """
        # TODO: create the engine and session within the for loop
        #       to take advantage of multiple cores
        session = db.get_db_session(db.get_db_engine(config),
                                    create_tables=True)

        cls.log.debug("Parsing row for patient {} with {} hashes".format(
            patid, len(pat_hashes)))

        links = {}
        to_investigate = {}

        if len(pat_hashes) == 0:
            cls.log.warning(
                "Patient [{}] does not have any hashes".format(patid))
            # create a link anyway
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(RULE_CODE_NO_HASH),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=None,
                linkage_added_at=datetime.now())
            # TODO: add code to return this special link

        elif len(pat_hashes) == 1:
            # only one hash was received
            rule_code, ahash = pat_hashes.popitem()
            existing_link = hash_uuid_lut.get(ahash)
            binary_hash = unhexlify(ahash.encode('utf-8'))

            if existing_link is None:
                # create new UUID
                uuid = utils.get_uuid()
                flag = FLAG_HASH_NOT_FOUND
            else:
                # reuse the existing UUID
                uuid = existing_link.linkage_uuid
                flag = FLAG_HASH_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code),  # we need the rule_id here
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=binary_hash,
                linkage_added_at=datetime.now())

            if rule_code == RULE_CODE_F_L_D_R:
                links = {ahash: new_link}
            else:
                links = {'': None, ahash: new_link}

        elif len(pat_hashes) == 2:
            links, to_investigate = cls._process_two_hashes(
                patid, pat_hashes, hash_uuid_lut, rules_cache, config, session,
                partner_code)
        return links, to_investigate
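
The TODO at the top of this variant suggests opening the engine and session
inside the loop so the work can spread across cores. One possible shape for
that, reusing the db calls already shown; the worker function and its argument
packing are hypothetical, not project code:

    from multiprocessing import Pool

    def _link_one_patient(args):
        """Hypothetical worker: each process opens its own engine/session."""
        patid, pat_hashes, config = args
        session = db.get_db_session(db.get_db_engine(config),
                                    create_tables=False)
        try:
            ...  # run the linkage logic for this patient using `session`
        finally:
            session.close()
        return patid

    # Driver sketch, assuming `rows` yields (patid, pat_hashes, config):
    #     with Pool() as pool:
    #         done = pool.map(_link_one_patient, rows)
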
Code example #6
File: link_generator.py  Project: ufbmi/onefl-deduper
    def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut,
                             rules_cache, config, session, partner_code,
                             skip_db_lookup):
        """
        TODO: This function does not yet handle the case where we run the
        linkage for the same patient twice.

        :return a tuple of OrderedDicts (linkage_entities, sha_to_investigate)
        """
        links = {}
        to_investigate = {}

        if len(pat_hashes) == 0:
            # create a link anyway
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(RULE_CODE_NO_HASH),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=None,
                linkage_added_at=datetime.now())
            links = {'': new_link}

        elif len(pat_hashes) == 1:
            # only one hash was received
            rule_code, ahash = pat_hashes.popitem()
            """
            Note: there are multiple cases when the same hash is associated
            with 2 or 3 different patients from the same partner --
            which means that we have to check every link object
            in the LUT to avoid linking different patids even if they have
            the same hash value.

            Example query:
                SELECT
                    linkage_hash, count(*) cc
                FROM
                    linkage
                WHERE
                    linkage_flag = 2  -- FLAG_SKIP_MATCH
                    and partner_code = 'xyz'
                GROUP BY
                    linkage_hash
                HAVING
                    COUNT(*) > 1
            """
            # the LUT maps each hash to a list of links
            existing_links = hash_uuid_lut.get(ahash)
            binary_hash = unhexlify(ahash.encode('utf-8'))

            if skip_db_lookup:
                # we detected two or more rows with the same hash
                uuid = utils.get_uuid()
                flag = FLAG_SKIP_REPEATED
            elif len(existing_links) == 0:
                # the hash search did not find any records => create new UUID
                uuid = utils.get_uuid()
                flag = FLAG_HASH_NOT_FOUND
            else:
                # If we find a link with the same hash from the same source
                # we ignore it and mark it accordingly
                if LinkageEntity.needs_to_skip_match_for_partner(
                        existing_links, partner_code, patid):
                    uuid = utils.get_uuid()
                    flag = FLAG_SKIP_MATCH
                else:
                    # reuse the first existing UUID
                    # TODO: check if the source of the link matters here?
                    uuid = existing_links[0].linkage_uuid
                    flag = FLAG_HASH_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code),  # we need the rule_id here
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=binary_hash,
                linkage_added_at=datetime.now())

            if rule_code == RULE_CODE_F_L_D_R:
                links = {ahash: new_link}
            else:
                links = {'': None, ahash: new_link}

        elif len(pat_hashes) == 2:
            links, to_investigate = cls._process_two_hashes(
                patid, pat_hashes, hash_uuid_lut, rules_cache, config, session,
                partner_code, skip_db_lookup)
        return links, to_investigate
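
In this revision needs_to_skip_match_for_partner has become a check over a
list of existing links and also receives the current patid. Based on the
comments in these examples ("two distinct patids with the same hash from the
same partner are considered different persons"), a hedged sketch of the rule
might look like:

    def needs_to_skip_match_for_partner(existing_links, partner_code, patid):
        """Hypothetical sketch: True if the hash is already linked to a
        different patid from the same partner."""
        return any(link.partner_code == partner_code and
                   link.linkage_patid != patid
                   for link in existing_links)
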
Code example #7
File: link_generator.py  Project: ufbmi/onefl-deduper
    def _process_two_hashes(cls, patid, pat_hashes, hash_uuid_lut, rules_cache,
                            config, session, partner_code, skip_db_lookup):
        """
        :param patid: string representing the patient processed
        :param hash_uuid_lut: a dictionary of links found in the database
            for every hash associated with the patid we are processing

        Note: a `1` in the comments below indicates that a hash is already in
            the database!

        We have to cover 2^2 + 1 = 5 cases:
            1. h1 => 0, h2 => 0     - create new UUID and use for both rows
            2. h1 => 0, h2 => 1  \  _ reuse a UUID
            3. h1 => 1, h2 => 0  /
            4. h1 => 1, h2 => 1 and UUIDs match
                - reuse a UUID and create two rows
            5. h1 => 1, h2 => 1 and the corresponding UUIDs do NOT match
                - reuse the UUID but link only the first hash
        """
        links = {}
        to_investigate = {}
        added_date = datetime.now()

        # TODO: This is ugly but we can make (if needed)
        # the logic work for "n" rules
        rule_code_1, ahash_1 = pat_hashes.popitem()
        rule_code_2, ahash_2 = pat_hashes.popitem()

        # The dictionary contains lists of links
        existing_links_1 = hash_uuid_lut.get(ahash_1)
        existing_links_2 = hash_uuid_lut.get(ahash_2)

        both_not_found = (len(existing_links_1) == 0
                          and len(existing_links_2) == 0)
        only_one_found = (
            (len(existing_links_1) == 0 and len(existing_links_2) > 0)
            or (len(existing_links_1) > 0 and len(existing_links_2) == 0))

        if both_not_found or skip_db_lookup:
            # create two links with a `fresh` UUID
            uuid = utils.get_uuid()
            flag = FLAG_SKIP_REPEATED if skip_db_lookup else FLAG_HASH_NOT_FOUND  # noqa

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)

        elif only_one_found:
            # reuse the existing UUID
            if len(existing_links_1) > 0:
                flag_2 = FLAG_HASH_NOT_FOUND

                # Two distinct patids with same hash from same partner are
                # considered different persons and get distinct UUIDs

                if skip_db_lookup:
                    uuid = utils.get_uuid()
                    flag_1 = FLAG_SKIP_REPEATED
                elif LinkageEntity.needs_to_skip_match_for_partner(
                        existing_links_1, partner_code, patid):
                    uuid = utils.get_uuid()
                    flag_1 = FLAG_SKIP_MATCH
                else:
                    uuid = existing_links_1[0].linkage_uuid
                    flag_1 = FLAG_HASH_FOUND
            else:
                flag_1 = FLAG_HASH_NOT_FOUND

                if skip_db_lookup:
                    uuid = utils.get_uuid()
                    flag_2 = FLAG_SKIP_REPEATED
                elif LinkageEntity.needs_to_skip_match_for_partner(
                        existing_links_2, partner_code, patid):
                    uuid = utils.get_uuid()
                    flag_2 = FLAG_SKIP_MATCH
                else:
                    uuid = existing_links_2[0].linkage_uuid
                    flag_2 = FLAG_HASH_FOUND

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag_1,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            new_link_2 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_2),
                linkage_patid=patid,
                linkage_flag=flag_2,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                linkage_added_at=added_date)
        else:
            # both are found
            if LinkageEntity.needs_to_skip_match_for_partner(
                    existing_links_1, partner_code, patid):
                uuid = utils.get_uuid()
                flag = FLAG_SKIP_MATCH
            else:
                uuid = existing_links_1[0].linkage_uuid
                flag = FLAG_HASH_FOUND

            new_link_1 = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code_1),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=unhexlify(ahash_1.encode('utf-8')),
                linkage_added_at=added_date)

            distinct_uuids = LinkageEntity.get_unique_uuids(
                existing_links_1, existing_links_2)

            if len(distinct_uuids) == 1:
                # UUIDs match - insert a row for the second hash too
                new_link_2 = LinkageEntity.create(
                    partner_code=partner_code,
                    rule_id=rules_cache.get(rule_code_2),
                    linkage_patid=patid,
                    linkage_flag=flag,
                    linkage_uuid=uuid,
                    linkage_hash=unhexlify(ahash_2.encode('utf-8')),
                    linkage_added_at=added_date)
            else:
                # the UUIDs do not match - we need to investigate
                to_investigate = {
                    ahash_2: ([lnk.linkage_uuid for lnk in existing_links_1] +
                              [lnk.linkage_uuid for lnk in existing_links_2])
                }
                cls.log.warning("Hashes of the patid [{}] are linked"
                                " to two distinct UUIDs: {}."
                                " We linked only the first hash!".format(
                                    patid, to_investigate))
                return {ahash_1: new_link_1}, to_investigate

        links[ahash_1] = new_link_1
        links[ahash_2] = new_link_2

        return links, {}