Code Example #1
File: base_test.py Project: yup111/onefl-deduper
    def setUp(self):
        """ Set the test database engine/session """

        super(BaseTestCase, self).setUp()
        config = Config(root_path='.', defaults={})
        config.from_pyfile(SETTINGS_FILE)
        self.config = config
        self.engine = db.get_db_engine(config)
        self.session = db.get_db_session(self.engine, create_tables=True)
        self.create_rules()
        self.create_partners()
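For orientation, a concrete test built on this fixture might look like the sketch below. This is a minimal sketch, not part of the project: it assumes the imports already used by `BaseTestCase` above plus `RuleEntity` (which appears in Code Example #2); the test name and assertion are illustrative.

class RulesSeededTest(BaseTestCase):
    """Minimal sketch of a test built on the fixture above."""

    def test_rules_created(self):
        # setUp() has already called create_rules(), so the rules
        # table should contain at least one row
        rules = self.session.query(RuleEntity).all()
        self.assertTrue(len(rules) > 0)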
Code Example #2
    @classmethod
    def _process_frame(cls, config, df_source, partner_code):
        """
        Process a fragment of the large file.

        Note: since the `session` object can't be pickled we
        create the session in every call (see commit bde49a90)

        .. seealso::
            :meth:`generate`

        """
        engine = db_utils.get_db_engine(config)
        session = db_utils.get_db_session(engine)
        # Init an empty frame and copy the patid from the source
        df = pd.DataFrame()
        df['PATID'] = df_source['PATID']
        investigations = []
        hash_uuid_lut = cls._populate_hash_uuid_lut(config, session, df_source)
        mapped_hashes = {ahash: link.linkage_uuid
                         for ahash, link in hash_uuid_lut.items()
                         if link is not None}
        rules_cache = RuleEntity.get_rules_cache(session)
        cls.log.debug("Found {} linked hashes in db: {}"
                      .format(len(mapped_hashes), mapped_hashes))

        # the rules are ordered by their importance
        rules = config['ENABLED_RULES'].values()
        patients_with_no_hashes = []

        for index, row in df_source.iterrows():
            patid = row['PATID']
            pat_hashes = {rule: row[rule] for rule in rules if row[rule] != ''}

            if len(pat_hashes) < 1:
                patients_with_no_hashes.append(patid)

            cls.log.debug("Parsing row for patient {} with {} hashes"
                          .format(patid, len(pat_hashes)))
            # NOTE: _process_patient_row creates its own session
            # (see Code Example #3), so none is passed here
            links, to_investigate = cls._process_patient_row(
                patid, pat_hashes.copy(), hash_uuid_lut,
                rules_cache, config, partner_code)
            cls.log.debug("Created {} links for patid: {}".format(len(links), patid))  # noqa

            if len(to_investigate) > 0:
                investigations.append(to_investigate)

            for i, (ahash, link) in enumerate(links.items(), start=1):
                df.loc[df['PATID'] == patid, 'UUID'] = (link.linkage_uuid
                                                        if link else '')
                df.loc[df['PATID'] == patid, "hash_{}".format(i)] = ahash

        len_missing_both = len(patients_with_no_hashes)

        if len_missing_both > 0:
            cls.log.warning("Patients with no hashes: {} (out of {}). {}"
                            .format(len_missing_both, len(df),
                                    patients_with_no_hashes))
        return df, investigations
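The dict comprehension near the top of `_process_frame` filters the lookup table down to hashes that already have a linkage row. A self-contained toy illustration of that step, where the `LinkStub` namedtuple stands in for the real `LinkageEntity` ORM class:

# Toy illustration of the hash -> UUID filtering done above;
# LinkStub is a stand-in for the LinkageEntity ORM class.
from collections import namedtuple

LinkStub = namedtuple('LinkStub', 'linkage_uuid')

hash_uuid_lut = {
    'aaa111': LinkStub(linkage_uuid='uuid-1'),  # hash already linked in db
    'bbb222': None,                             # hash not seen before
}
mapped_hashes = {ahash: link.linkage_uuid
                 for ahash, link in hash_uuid_lut.items()
                 if link is not None}
assert mapped_hashes == {'aaa111': 'uuid-1'}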
Code Example #3
    @classmethod
    def _process_patient_row(cls, patid, pat_hashes, hash_uuid_lut,
                             rules_cache, config, partner_code):
        """
        :return: OrderedDict with the newly created linkage entities
        """
        # TODO: need to create the engine and session within the for loop
        #   to take advantage of multi-core
        session = db.get_db_session(db.get_db_engine(config),
                                    create_tables=True)

        cls.log.debug("Parsing row for patient {} with {} hashes".format(
            patid, len(pat_hashes)))

        links = {}
        to_investigate = {}

        if len(pat_hashes) == 0:
            cls.log.warning("Patient [{}] does not have any hashes"
                            .format(patid))
            # create a link anyway
            uuid = utils.get_uuid()
            flag = FLAG_HASH_NOT_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(RULE_CODE_NO_HASH),
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=None,
                linkage_added_at=datetime.now())
            # TODO: add code to return this special link

        elif len(pat_hashes) == 1:
            # only one hash was received
            rule_code, ahash = pat_hashes.popitem()
            existing_link = hash_uuid_lut.get(ahash)
            binary_hash = unhexlify(ahash.encode('utf-8'))

            if existing_link is None:
                # create new UUID
                uuid = utils.get_uuid()
                flag = FLAG_HASH_NOT_FOUND
            else:
                # reuse the existing UUID
                uuid = existing_link.linkage_uuid
                flag = FLAG_HASH_FOUND

            new_link = LinkageEntity.create(
                partner_code=partner_code,
                rule_id=rules_cache.get(rule_code),  # we need the rule_id here
                linkage_patid=patid,
                linkage_flag=flag,
                linkage_uuid=uuid,
                linkage_hash=binary_hash,
                linkage_added_at=datetime.now())

            if rule_code == RULE_CODE_F_L_D_R:
                links = {ahash: new_link}
            else:
                links = {'': None, ahash: new_link}

        elif len(pat_hashes) == 2:
            links, to_investigate = cls._process_two_hashes(
                patid, pat_hashes, hash_uuid_lut, rules_cache, config, session,
                partner_code)
        return links, to_investigate
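The single-hash branch boils down to a reuse-or-mint decision: reuse the UUID of an existing link, or mint a fresh one and flag the hash as new. A standalone sketch of just that decision; the flag constants and the UUID helper are stand-ins for the project's own helpers:

# Standalone sketch of the reuse-or-mint decision in the
# one-hash branch; constants and the UUID helper are stand-ins.
import uuid

FLAG_HASH_NOT_FOUND = 0
FLAG_HASH_FOUND = 1

def resolve_uuid(ahash, hash_uuid_lut):
    """Return (uuid, flag), reusing the linked UUID if the hash is known."""
    existing_link = hash_uuid_lut.get(ahash)
    if existing_link is None:
        return uuid.uuid4().hex, FLAG_HASH_NOT_FOUND
    return existing_link.linkage_uuid, FLAG_HASH_FOUND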
Code Example #4
    @classmethod
    def generate(cls, config, inputdir, outputdir, partner):
        """
        Read the "phi_hashes.csv" file and generate UUIDs.

        Optionally save the "hash -> UUID" mapping to "links.csv"
        """
        cls._validate_config(config)
        engine = db.get_db_engine(config)

        # create the tables up-front; `_process_frame` builds its own
        # session, since session objects can't be pickled (see above)
        # TODO: add a method parameter for controlling the `create_table` flag
        session = db.get_db_session(engine, create_tables=True)

        EXPECTED_COLS = config['EXPECTED_COLS']
        SAVE_OUT_FILE = config['SAVE_OUT_FILE']
        in_file = os.path.join(inputdir, config['IN_FILE'])

        reader = None

        try:
            reader = pd.read_csv(in_file,
                                 sep=config['IN_DELIMITER'],
                                 dtype=object,
                                 skipinitialspace=True,
                                 skip_blank_lines=True,
                                 usecols=list(EXPECTED_COLS),
                                 chunksize=config['LINES_PER_CHUNK'],
                                 iterator=True)
            cls.log.info("Reading data from file: {} ({})".format(
                in_file, utils.get_file_size(in_file)))

        except ValueError as exc:
            cls.log.info("Please check that the actual column names"
                         " in [{}] match the expected column names:"
                         " {}".format(in_file, sorted(EXPECTED_COLS)))
            cls.log.error("Error: {}".format(exc))
            # without a reader there is nothing left to process
            return False

        frames = []
        investigations = []

        for df_source in reader:
            df_source.fillna('', inplace=True)
            # The magic happens here...
            df, to_investigate = cls._process_frame(config, df_source,
                                                    partner)
            if SAVE_OUT_FILE:
                frames.append(df)
                investigations.extend(to_investigate)

        if SAVE_OUT_FILE:
            df = pd.concat(frames, ignore_index=True)
            out_file = os.path.join(outputdir, config['OUT_FILE'])
            out_file_investigate = os.path.join(outputdir,
                                                config['OUT_FILE_INVESTIGATE'])
            utils.frame_to_file(df,
                                out_file,
                                delimiter=config['OUT_DELIMITER'])
            cls.log.info("Wrote output file: {} ({} data rows, {})".format(
                out_file, len(df), utils.get_file_size(out_file)))

            with open(out_file_investigate, 'w') as invf:
                for line in investigations:
                    invf.write("{}\n".format(line))
            cls.log.info("Wrote hashes that need investigation to {}".format(
                out_file_investigate))
        return True
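Putting it together, a driver might invoke `generate()` roughly as below. This is a sketch under assumptions: the config keys mirror the ones read in the snippet, but the values, paths, the partner code, and the owning class name (`LinkGenerator`) are illustrative, and the config must also carry the database settings consumed by `db.get_db_engine()`.

# Illustrative driver for generate(); values are examples only.
config = {
    'EXPECTED_COLS': ['PATID', 'F_L_D_R'],   # column list is hypothetical
    'ENABLED_RULES': {'1': 'F_L_D_R'},       # rule code as in Example #3
    'IN_FILE': 'phi_hashes.csv',
    'IN_DELIMITER': ',',
    'LINES_PER_CHUNK': 10000,
    'SAVE_OUT_FILE': True,
    'OUT_FILE': 'links.csv',
    'OUT_FILE_INVESTIGATE': 'links_investigate.csv',
    'OUT_DELIMITER': ',',
    # ...plus the database settings used by db.get_db_engine()
}
LinkGenerator.generate(config, inputdir='/data/in', outputdir='/data/out',
                       partner='HCN')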