コード例 #1
0
class NAACCR_Ontology2(_RunScriptTask):
    '''NAACCR Ontology: un-published code values

    That is, code values that show up in tumor registry data
    without appearing in any published ontology.
    '''
    # flat file attributes
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()
    source_cd = pv.StrParam(default='*****@*****.**')

    # The default folds in _stable_hash() of the ontology script code and
    # the data summary script code (presumably so the task identity
    # tracks changes to those scripts -- confirm against _stable_hash).
    z_design_id = pv.StrParam(
        default='length 50 (%s)' % _stable_hash(
            tr_ont.NAACCR_I2B2.ont_script.code,
            td.DataSummary.script.code))

    script_name = 'naaccr_concepts_mix.sql'
    script = res.read_text(heron_load, script_name)

    @property
    def classpath(self) -> str:
        '''The JDBC driver jar is the only classpath entry.'''
        return self.jdbc_driver_jar

    def _upload_target(self) -> 'UploadTarget':
        '''Make an upload target in our schema, named by our task_id.'''
        return UploadTarget(self, self.schema, transform_name=self.task_id)

    def requires(self) -> Dict[str, luigi.Task]:
        '''Depend on the NAACCR summary and the published ontology.

        Both required tasks get this task's db_url, user, and passkey;
        the summary also gets the flat file attributes.
        '''
        _configure_logging(self.log_dest)

        db_access = dict(
            db_url=self.db_url,
            user=self.user,
            passkey=self.passkey,
        )
        summary = NAACCR_Summary(
            dateCaseReportExported=self.dateCaseReportExported,
            npiRegistryId=self.npiRegistryId,
            **db_access)
        ont1 = NAACCR_Ontology1(**db_access)

        return {
            'NAACCR_Ontology1': ont1,
            'NAACCR_Summary': summary,
        }

    def run_upload(self, conn: Connection, upload_id: int) -> None:
        '''Run the concepts-mix script for the given upload.

        :param conn: connection passed through to run_script
        :param upload_id: passed as a string script variable and
                          as-is among the script parameters
        '''
        variables = {
            'upload_id': str(upload_id),
            'task_id': self.task_id,
        }
        script_params = {
            'upload_id': upload_id,
            'task_id': self.task_id,
            'source_cd': self.source_cd,
            'update_date': self.dateCaseReportExported,
        }
        self.run_script(
            conn, self.script_name, self.script,
            variables=variables,
            script_params=script_params)
コード例 #2
0
class _NAACCR_JDBC(SparkJDBCTask):
    """Load data from a NAACCR flat file into a table via JDBC.

    Freshness is managed with a `task_id` column: the output target
    is a query for rows of `table_name` carrying this task's task_id.

    Subclasses supply `table_name` and implement `_data`.
    """
    table_name: str
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()

    def requires(self) -> Dict[str, luigi.Task]:
        """Depend on the flat file with the same export date and registry.
        """
        flat_file = NAACCR_FlatFile(
            dateCaseReportExported=self.dateCaseReportExported,
            npiRegistryId=self.npiRegistryId)
        return {'NAACCR_FlatFile': flat_file}

    def _flat_file_task(self) -> NAACCR_FlatFile:
        """Look up the flat file task among our requirements."""
        required = self.requires()
        return cast(NAACCR_FlatFile, required['NAACCR_FlatFile'])

    def output(self) -> luigi.Target:
        """Target a query for rows tagged with this task's task_id."""
        fresh_rows = f"""
          (select 1 from {self.table_name}
           where task_id = '{self.task_id}')
        """
        return JDBCTableTarget(self, fresh_rows)

    def main_action(self, sparkContext: SparkContext_T) -> None:
        """Read the flat file as text, build the data, and write it out.

        The data from `_data` gets a literal `task_id` column, goes
        through `td.case_fold`, and is written to `table_name` in
        overwrite mode.
        """
        quiet_logs(sparkContext)
        session = SparkSession(sparkContext)
        flat_file_path = str(self._flat_file_task().flat_file)
        text_lines = session.read.text(flat_file_path)

        # ISSUE: task_id is kinda long; how about just task_hash?
        # luigi_task_hash?
        tagged = self._data(session, text_lines).withColumn(
            'task_id', func.lit(self.task_id))
        folded = td.case_fold(tagged)
        self.account().wr(folded.write, self.table_name, mode='overwrite')

    def _data(self, spark: SparkSession,
              naaccr_text_lines: DataFrame) -> DataFrame:
        """Turn the text lines of the flat file into the data to load.

        :raises NotImplementedError: always, in this base class.
        """
        raise NotImplementedError('subclass must implement')
コード例 #3
0
class NAACCR_Load(_RunScriptTask):
    '''Map and load NAACCR patients, tumors / visits, and facts.

    Runs the facts load script and then the de-identification script
    within one upload.
    '''
    # flat file attributes
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()

    # encounter mapping
    encounter_ide_source = pv.StrParam(default='*****@*****.**')
    project_id = pv.StrParam(default='BlueHeron')
    source_cd = pv.StrParam(default='*****@*****.**')

    # ISSUE: task_id should depend on dest schema / owner.
    z_design_id = pv.StrParam(default='nested fields')

    script_name = 'naaccr_facts_load.sql'
    script_deid_name = 'i2b2_facts_deid.sql'
    script = res.read_text(heron_load, script_name)
    script_deid = res.read_text(heron_load, script_deid_name)

    @property
    def classpath(self) -> str:
        '''The JDBC driver jar is the only classpath entry.'''
        return self.jdbc_driver_jar

    def _upload_target(self) -> 'UploadTarget':
        '''Make an upload target in our schema, named by our task_id.'''
        return UploadTarget(self, self.schema, transform_name=self.task_id)

    def requires(self) -> Dict[str, luigi.Task]:
        '''Depend on the flat file plus patients, visits, and facts.

        Requirements are keyed by class name.
        '''
        _configure_logging(self.log_dest)

        flat_file = NAACCR_FlatFile(
            dateCaseReportExported=self.dateCaseReportExported,
            npiRegistryId=self.npiRegistryId)

        deps: Dict[str, luigi.Task] = {}
        for part in (NAACCR_Patients, NAACCR_Visits, NAACCR_Facts):
            deps[part.__name__] = part(
                db_url=self.db_url,
                user=self.user,
                passkey=self.passkey,
                dateCaseReportExported=self.dateCaseReportExported,
                npiRegistryId=self.npiRegistryId)
        deps['NAACCR_FlatFile'] = flat_file
        return deps

    def _flat_file_task(self) -> NAACCR_FlatFile:
        '''Look up the flat file task among our requirements.'''
        required = self.requires()
        return cast(NAACCR_FlatFile, required['NAACCR_FlatFile'])

    def _patients_task(self) -> NAACCR_Patients:
        '''Look up the patients task among our requirements.'''
        required = self.requires()
        return cast(NAACCR_Patients, required['NAACCR_Patients'])

    def run_upload(self, conn: Connection, upload_id: int) -> None:
        '''Run the load script, then the de-id script, for this upload.

        :param conn: connection passed through to run_script
        :param upload_id: passed as a string script variable and
                          as-is among the script parameters
        '''
        flat_file = self._flat_file_task()
        patients = self._patients_task()

        # ISSUE: split these into separate tasks?
        steps = (
            (self.script_name, self.script),
            (self.script_deid_name, self.script_deid),
        )
        for step_name, sql in steps:
            # Fresh dicts for each script, as run_script may keep them.
            variables = {
                'upload_id': str(upload_id),
                'task_id': self.task_id,
            }
            script_params = {
                'upload_id': upload_id,
                'project_id': self.project_id,
                'task_id': self.task_id,
                'source_cd': self.source_cd,
                'download_date': flat_file.dateCaseReportExported,
                'patient_ide_source': patients.patient_ide_source,
                'encounter_ide_source': self.encounter_ide_source,
            }
            self.run_script(
                conn, step_name, sql,
                variables=variables,
                script_params=script_params)
コード例 #4
0
class NAACCR_FlatFile(ManualTask):
    """A NAACCR flat file is determined by the registry, export date,
    and version.

    The task is complete when the first record of `flat_file` agrees
    with those parameters and the file holds at least `record_qty_min`
    records. With `testData` set, a failed check is logged and ignored.
    """
    naaccrRecordVersion = pv.IntParam(default=180)
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()
    testData = pv.BoolParam(default=False, significant=False)
    flat_file = pv.PathParam(significant=False)
    record_qty_min = pv.IntParam(significant=False, default=1)

    def check_version_param(self) -> None:
        """Only version 18 (180) is currently supported.

        :raises NotImplementedError: for any other naaccrRecordVersion.
        """
        if self.naaccrRecordVersion != 180:
            raise NotImplementedError(
                'unsupported naaccrRecordVersion: %s' %
                (self.naaccrRecordVersion,))

    def complete(self) -> bool:
        """Run `complete_action` inside a 'complete' task action,
        recording the result as a success field.
        """
        with task_action(self, 'complete') as ctx:
            result = self.complete_action()
            ctx.add_success_fields(result=result)
            return result

    def complete_action(self) -> bool:
        """Check the first record, assuming all the others have
        the same export date and registry NPI.

        :return: True if the version, registry, and export date of the
                 first record match and there are enough records, or
                 if `testData` is set; False otherwise.
        :raises NotImplementedError: if the version is not supported.
        """
        self.check_version_param()

        with self.flat_file.open() as records:
            record0 = records.readline()
            # readline() gives '' on an empty file; don't count that
            # as a record. Iterate rather than readlines() so the
            # rest of the file is counted without being held in memory.
            qty = (1 if record0 else 0) + sum(1 for _ in records)
        enough = qty >= self.record_qty_min
        log.info('record qty: %d (>= %d? %s)', qty, self.record_qty_min,
                 enough)

        # Run every check, even after one fails, so each mismatch is logged.
        vOk = self._checkItem(record0, 'naaccrRecordVersion',
                              str(self.naaccrRecordVersion))
        regOk = self._checkItem(record0, 'npiRegistryId', self.npiRegistryId)
        dtOk = self._checkItem(record0, 'dateCaseReportExported',
                               self.dateCaseReportExported.strftime('%Y%m%d'))

        if vOk and regOk and dtOk and enough:
            return True
        if self.testData:
            log.warning('ignoring failed FlatFile check')
            return True
        return False

    @classmethod
    def _checkItem(cls, record: str, naaccrId: str, expected: str) -> bool:
        '''Check that a fixed-width item of a record has the expected text.

        The item's position comes from the `startColumn` (1-based) and
        `length` attributes of its definition in `tr_ont.NAACCR1`.
        A mismatch is logged as a warning.

        >>> npi = '1234567890'
        >>> record0 = ' ' * 19 + npi
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', npi)
        True
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', 'XXX')
        False
        '''
        itemDef = tr_ont.NAACCR1.itemDef(naaccrId)
        startColumn, length = (
            int(itemDef.attrib[it]) for it in ['startColumn', 'length'])
        # Convert to a 0-based slice.
        start = startColumn - 1
        end = start + length
        actual = record[start:end]
        matches = actual == expected
        if not matches:
            # Report the same slice bounds that were compared.
            log.warning('%s: expected %s [%s:%s] = {%s} but found {%s}',
                        cls.__name__, naaccrId, start, end,
                        expected, actual)
        return matches