Example 1
    def create_links(self):
        """ Verify we can store LINKAGE rows """
        cache = RuleEntity.get_rules_cache(self.session)
        added_date = datetime.now()
        ahash = binascii.unhexlify('2B2D67AED8D511E6A41AF45C898E9B67'.encode())
        rule_id = cache.get(RULE_CODE_F_L_D_R)
        uuid_list = [utils.get_uuid(), utils.get_uuid()]

        links = []

        for uuid in uuid_list:
            link = LinkageEntity.create(partner_code='UFH',
                                        rule_id=rule_id,
                                        linkage_patid='123',
                                        linkage_flag=0,
                                        linkage_uuid=uuid,
                                        linkage_hash=ahash,
                                        linkage_added_at=added_date)
            self.assertIsNotNone(link.id)
            links.append(link)
            print(link)

        # Search links matching a hash
        links_by_hash = self.session.query(LinkageEntity).filter_by(
            linkage_hash=ahash).all()
        self.assertIsNotNone(links_by_hash)
        self.assertEqual(2, len(links_by_hash))

        unique = LinkageEntity.get_unique_uuids(links, links_by_hash)
        self.assertEqual(2, len(unique))
        print(unique)
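`LinkageEntity.get_unique_uuids` is not shown in these listings. A hypothetical sketch consistent with the assertions above (two links with distinct UUIDs in, a set of size two out), assuming it simply collects distinct `linkage_uuid` values:

def get_unique_uuids(links, links_by_hash):
    """Return the set of distinct linkage_uuid values across both lists.

    Sketch only: inferred from the test assertions above; the real
    LinkageEntity.get_unique_uuids may differ.
    """
    return {link.linkage_uuid
            for link in list(links) + list(links_by_hash)
            if link is not None}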
Example 2
    def create_links(self):
        """ Verify we can store LINKAGE rows """
        pers_uuid = utils.get_uuid()

        # For the real code we can just copy the hash
        ahash = binascii.unhexlify('2B2D67AED8D511E6A41AF45C898E9B67'.encode())
        added_date = datetime.now()
        cache = RuleEntity.get_rules_cache(self.session)
        rule_id = cache.get(RULE_CODE_F_L_D_R)

        link = LinkageEntity.create(partner_code='UFH',
                                    rule_id=rule_id,
                                    linkage_patid='123',
                                    linkage_flag=0,
                                    linkage_uuid=pers_uuid,
                                    linkage_hash=ahash,
                                    linkage_added_at=added_date)
        self.assertIsNotNone(link.id)

        print(link)

        # Search links matching a hash -- should return at most one row
        links_by_hash = self.session.query(LinkageEntity).filter_by(
            linkage_hash=ahash).all()
        self.assertEqual(1, len(links_by_hash))
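Both tests mint identifiers with `utils.get_uuid()`, which is not listed. A minimal sketch, assuming it returns a 32-character hex string in line with the fixed-width hash strings used here:

import uuid

def get_uuid():
    """Return a random UUID as a 32-character hex string.

    Sketch only: the real utils.get_uuid may format the UUID differently.
    """
    return uuid.uuid4().hex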
Example 3
    def create_rules(self):
        """ Create rule rows """
        added_date = datetime.now()

        # with self.assertRaises(NoResultFound):
        #     self.session.query(RuleEntity).filter_by(id=1).one()

        rule_r = self.session.query(RuleEntity).filter_by(
            rule_code=RULE_CODE_F_L_D_R).one_or_none()
        rule_s = self.session.query(RuleEntity).filter_by(
            rule_code=RULE_CODE_F_L_D_S).one_or_none()
        rule_x = self.session.query(RuleEntity).filter_by(
            rule_code=RULE_CODE_NO_HASH).one_or_none()

        if rule_r is None:
            RuleEntity.create(rule_code=RULE_CODE_F_L_D_R,
                              rule_description='First Last DOB Race',
                              rule_added_at=added_date)

        if rule_s is None:
            RuleEntity.create(rule_code=RULE_CODE_F_L_D_S,
                              rule_description='First Last DOB Sex',
                              rule_added_at=added_date)

        if rule_x is None:
            RuleEntity.create(rule_code=RULE_CODE_NO_HASH,
                              rule_description='No hash',
                              rule_added_at=added_date)

        # self.assertEqual(2, rule.id)
        cache = RuleEntity.get_rules_cache(self.session)
        self.assertIsNotNone(cache)
        print(cache)
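The tests call `RuleEntity.get_rules_cache(session)` and then resolve rule ids with `cache.get(RULE_CODE_F_L_D_R)`, so the cache behaves like a dict mapping rule codes to ids. A minimal sketch under that assumption:

def get_rules_cache(session):
    """Map rule_code -> rule id for every stored rule.

    Sketch only: inferred from cache.get(RULE_CODE_F_L_D_R) returning a
    rule id above; the real RuleEntity.get_rules_cache may cache more.
    """
    return {rule.rule_code: rule.id
            for rule in session.query(RuleEntity).all()}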
Example 4
    def _process_frame(cls, config, session, df_source, partner_code):
        """
        """
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']
        investigations = []
        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {ahash: link.linkage_uuid
                         for ahash, link in hash_uuid_lut.items()
                         if link is not None}
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}"
                      .format(len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()
        patients_with_no_hashes = []

        for index, row in df_source.iterrows():
            patid = row['PATID']
            pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

            if len(pat_hashes) < 1:
                patients_with_no_hashes.append(patid)

            cls.log.debug("Parsing row for patient {} with {} hashes"
                          .format(patid, len(pat_hashes)))
            links, to_investigate = cls._process_patient_row(
                patid, pat_hashes.copy(), hash_uuid_lut,
                rules_cache, config, session,
                partner_code)
            cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

            if len(to_investigate) > 0:
                investigations.append(to_investigate)

            for i, (ahash, link) in enumerate(links.items(), start=1):
                df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                        if link else '')
                df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

        cls.log.warning("{} out of {} patients are missing both hashes: {}"
                        .format(len(patients_with_no_hashes), len(df),
                                patients_with_no_hashes))
        return df, investigations
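`_populate_hash_uuid_lut` is referenced by every `_process_frame` variant but never listed. Its usage (items are `(ahash, link)` pairs where `link` may be `None`) suggests it maps each hash string in the source frame to its existing LinkageEntity row, or to `None` when the hash is not linked yet. A hedged sketch; the column list and the query shape are assumptions:

import binascii

def populate_hash_uuid_lut(config, session, df_source):
    """Map each hash string in df_source to a LinkageEntity row or None.

    Sketch only: inferred from the (ahash, link) pairs consumed above.
    """
    rules = config['ENABLED_RULES'].values()
    hashes = {row[rule]
              for _, row in df_source.iterrows()
              for rule in rules
              if row[rule] != ''}
    lut = {}
    for ahash in hashes:
        # linkage_hash is stored as binary (see the tests above), so the
        # hex string from the frame is unhexlified before querying
        lut[ahash] = (session.query(LinkageEntity)
                      .filter_by(linkage_hash=binascii.unhexlify(ahash))
                      .one_or_none())
    return lut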
Example 5
    def _process_frame(cls, config, session, df_source, partner_code):
        """
        """
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']

        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {
            ahash: link.linkage_uuid
            for ahash, link in hash_uuid_lut.items() if link is not None
        }
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}".format(
            len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()

        # Collect patients that have no hashes at all
        patients_with_no_hashes = [
            row['PATID'] for _, row in df_source.iterrows()
            if all(row[rule] == '' for rule in rules)
        ]

        # Create the multiprocessing pool used to parallelize row processing
        pool = mp.Pool(processes=NUM_CPUS)

        df, investigations = cls._process_patient_row_main(
            cls, pool, df, df_source, rules, hash_uuid_lut, rules_cache,
            config, partner_code)
        cls.log.warning(
            "{} out of {} patients are missing both hashes: {}".format(
                len(patients_with_no_hashes), len(df),
                patients_with_no_hashes))
        return df, investigations
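`_process_patient_row_main` is not listed either. Reconstructed from the inline loop in Example 7, a hypothetical sketch could dispatch one async job per row and then fold the results back into the frame:

def process_patient_row_main(cls, pool, df, df_source, rules,
                             hash_uuid_lut, rules_cache, config,
                             partner_code):
    """Dispatch one async job per patient row, then merge the results.

    Sketch only: mirrors the loop in Example 7; the real helper may differ.
    """
    jobs = []
    for _, row in df_source.iterrows():
        pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}
        jobs.append(utils.apply_async(
            pool, cls._process_patient_row,
            (row['PATID'], pat_hashes, hash_uuid_lut,
             rules_cache, config, partner_code)))

    investigations = []
    for (_, row), job in zip(df_source.iterrows(), jobs):
        links, to_investigate = job.get()
        patid = row['PATID']
        if to_investigate:
            investigations.append(to_investigate)
        for i, (ahash, link) in enumerate(links.items(), start=1):
            df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                    if link else '')
            df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

    pool.close()
    pool.join()
    return df, investigations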
Example 6
    def _process_frame(cls, config, df_source, partner_code):
        """
        Process a fragment of the large file.

        Note: since the `session` object can't be pickled we
        create the session in every call (see commit bde49a90)

        .. seealso::
            :meth:`generate`

        """
        engine = db_utils.get_db_engine(config)
        session = db_utils.get_db_session(engine)
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']
        investigations = []
        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {ahash: link.linkage_uuid
                         for ahash, link in hash_uuid_lut.items()
                         if link is not None}
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}"
                      .format(len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()
        patients_with_no_hashes = []

        for index, row in df_source.iterrows():
            patid = row['PATID']
            pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

            if len(pat_hashes) < 1:
                patients_with_no_hashes.append(patid)

            cls.log.debug("Parsing row for patient {} with {} hashes"
                          .format(patid, len(pat_hashes)))
            links, to_investigate = cls._process_patient_row(
                patid, pat_hashes.copy(), hash_uuid_lut,
                rules_cache, config, session,
                partner_code)
            cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

            if len(to_investigate) > 0:
                investigations.append(to_investigate)

            for i, (ahash, link) in enumerate(links.items(), start=1):
                df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                        if link else '')
                df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

        len_missing_both = len(patients_with_no_hashes)

        if len_missing_both > 0:
            cls.log.warning("Patients with no hashes: {} (out of {}). {}"
                            .format(len_missing_both, len(df),
                                    patients_with_no_hashes))
        return df, investigations
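Because this variant opens its own engine and session (sessions do not survive pickling, per the docstring), the `generate` caller it points to presumably splits the input frame and maps `_process_frame` over the chunks. A hedged sketch of that calling pattern; `np.array_split` and `pool.starmap` are assumptions:

import multiprocessing as mp
import numpy as np
import pandas as pd

def generate(cls, config, df_source, partner_code):
    """Split the source frame into chunks and process them in parallel.

    Sketch only: the one documented constraint is that each chunk builds
    its own session inside _process_frame.
    """
    chunks = np.array_split(df_source, NUM_CPUS)
    with mp.Pool(processes=NUM_CPUS) as pool:
        results = pool.starmap(
            cls._process_frame,
            [(config, chunk, partner_code) for chunk in chunks])
    frames = [frame for frame, _ in results]
    investigations = [inv for _, invs in results for inv in invs]
    return pd.concat(frames), investigations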
Example 7
    def _process_frame(cls, config, session, df_source, partner_code):
        """
        """
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']
        investigations = []
        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {ahash: link.linkage_uuid
                         for ahash, link in hash_uuid_lut.items()
                         if link is not None}
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}"
                      .format(len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()
        patients_with_no_hashes = []

        # Process rows in parallel using a multiprocessing pool
        jobs = []
        frames = []
        pool = mp.Pool(processes=NUM_CPUS)

        # Each iteration dispatches one patient row as an async job
        for index, row in df_source.iterrows():
            patid = row['PATID']
            pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

            if len(pat_hashes) < 1:
                patients_with_no_hashes.append(patid)

            cls.log.debug("Parsing row for patient {} with {} hashes"
                          .format(patid, len(pat_hashes)))

            #    Extract results from a pool.apply_async call using get()
            #    See:
            #    https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.AsyncResult
            job = utils.apply_async(pool,
                                    cls._process_patient_row,
                                    (patid, pat_hashes.copy(), hash_uuid_lut,
                                     rules_cache, config, partner_code))
            jobs.append(job)

        job_count = len(jobs)
        cls.log.info("Total multiproc jobs: {}".format(job_count))

        # collect the results
        for index, job in enumerate(jobs):
            try:
                frames.append(job.get())
                if index % 10 == 0:
                    cls.log.info("Got results for frame {} (out of {})"
                                 .format(index, job_count))
            except Exception as exc:
                cls.log.error("Job [{}] error: {}".format(index, exc))
                mp.get_logger().error(traceback.format_exc())

        pool.close()
        pool.join()
        cls.log.info("Got all {} frames. Concatenating...".format(job_count))

        # Fold each (links, to_investigate) result back into the frame.
        # Kludgy: we iterate over df_source.iterrows() again to recover PATID
        for frame, row in zip(frames, df_source.iterrows()):
            patid = row[1]['PATID']
            links = frame[0]
            to_investigate = frame[1]
            #    NOTE: printing to console/writing to the log can slow workers.
            #        Do we need the number of links created for each patid?
            #        Could it be written to the df and then to a SQL table/csv?
            cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

            if len(to_investigate) > 0:
                investigations.append(to_investigate)

            for i, (ahash, link) in enumerate(links.items(), start=1):
                df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                        if link else '')
                df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

        cls.log.warning("{} out of {} patients are missing both hashes: {}"
                        .format(len(patients_with_no_hashes), len(df),
                                patients_with_no_hashes))
        return df, investigations
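`utils.apply_async` wraps `Pool.apply_async`; routing the call through a module-level function is a common trick to keep the dispatched target picklable when it is a method. The real helper is not listed; a minimal sketch:

def run_fun(fun, args):
    """Unpack and execute the job inside the worker process."""
    return fun(*args)

def apply_async(pool, fun, args):
    """Thin wrapper over Pool.apply_async (sketch only).

    Dispatching run_fun (a plain module-level function) avoids pickling
    issues with bound/class methods.
    """
    return pool.apply_async(run_fun, (fun, args))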