class NAACCR_Ontology2(_RunScriptTask):
    '''NAACCR Ontology: un-published code values,
    i.e. code values that occur in tumor registry data
    but not in published ontologies.
    '''
    # flat file attributes
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()

    source_cd = pv.StrParam(default='*****@*****.**')
    z_design_id = pv.StrParam(default='length 50 (%s)' % _stable_hash(
        tr_ont.NAACCR_I2B2.ont_script.code,
        td.DataSummary.script.code))

    script_name = 'naaccr_concepts_mix.sql'
    script = res.read_text(heron_load, script_name)

    @property
    def classpath(self) -> str:
        return self.jdbc_driver_jar

    def _upload_target(self) -> 'UploadTarget':
        return UploadTarget(self, self.schema, transform_name=self.task_id)

    def requires(self) -> Dict[str, luigi.Task]:
        _configure_logging(self.log_dest)
        summary = NAACCR_Summary(
            db_url=self.db_url,
            user=self.user,
            passkey=self.passkey,
            dateCaseReportExported=self.dateCaseReportExported,
            npiRegistryId=self.npiRegistryId,
        )
        ont1 = NAACCR_Ontology1(
            db_url=self.db_url,
            user=self.user,
            passkey=self.passkey,
        )
        return dict(NAACCR_Ontology1=ont1, NAACCR_Summary=summary)

    def run_upload(self, conn: Connection, upload_id: int) -> None:
        self.run_script(
            conn, self.script_name, self.script,
            variables=dict(upload_id=str(upload_id),
                           task_id=self.task_id),
            script_params=dict(
                upload_id=upload_id,
                task_id=self.task_id,
                source_cd=self.source_cd,
                update_date=self.dateCaseReportExported))
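
# Note on z_design_id above (an explanatory sketch, not part of the original
# module): folding a stable hash of the SQL script text into a task parameter
# makes the luigi task_id change whenever either script changes, so script
# edits force a fresh upload. A stable text hash could look roughly like the
# following; the module's own _stable_hash helper (defined elsewhere) may
# differ in detail.
def _stable_hash_sketch(*texts: str) -> int:
    import hashlib
    digest = hashlib.sha256(':'.join(texts).encode('utf-8')).hexdigest()
    return int(digest[:16], 16)  # first 64 bits as an int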
class _NAACCR_JDBC(SparkJDBCTask):
    """Load data from a NAACCR flat file into a table via JDBC.

    Use a `task_id` column to manage freshness.
    """
    table_name: str

    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()

    def requires(self) -> Dict[str, luigi.Task]:
        return {
            'NAACCR_FlatFile': NAACCR_FlatFile(
                dateCaseReportExported=self.dateCaseReportExported,
                npiRegistryId=self.npiRegistryId)
        }

    def _flat_file_task(self) -> NAACCR_FlatFile:
        return cast(NAACCR_FlatFile, self.requires()['NAACCR_FlatFile'])

    def output(self) -> luigi.Target:
        query = f"""
            (select 1 from {self.table_name}
             where task_id = '{self.task_id}')
        """
        return JDBCTableTarget(self, query)

    def main_action(self, sparkContext: SparkContext_T) -> None:
        quiet_logs(sparkContext)
        spark = SparkSession(sparkContext)
        ff = self._flat_file_task()

        naaccr_text_lines = spark.read.text(str(ff.flat_file))
        data = self._data(spark, naaccr_text_lines)
        # ISSUE: task_id is kinda long; how about just task_hash?
        # luigi_task_hash?
        data = data.withColumn('task_id', func.lit(self.task_id))
        data = td.case_fold(data)

        self.account().wr(data.write, self.table_name, mode='overwrite')

    def _data(self, spark: SparkSession,
              naaccr_text_lines: DataFrame) -> DataFrame:
        raise NotImplementedError('subclass must implement')
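
# A minimal sketch (not part of the original module) of how a concrete
# subclass fills in the _data() hook above. The class name, table name,
# and column position are illustrative assumptions; the real subclasses
# (NAACCR_Patients, NAACCR_Visits, NAACCR_Facts) derive their layout from
# the NAACCR record definitions.
class _NAACCR_JDBC_Sketch(_NAACCR_JDBC):
    table_name = 'NAACCR_SKETCH'  # hypothetical destination table

    def _data(self, spark: SparkSession,
              naaccr_text_lines: DataFrame) -> DataFrame:
        # spark.read.text() yields one `value` column per record, and
        # substring() is 1-based. The position below matches the
        # npiRegistryId doctest in NAACCR_FlatFile._checkItem but is
        # illustrative only, not a full NAACCR v18 layout.
        return naaccr_text_lines.select(
            func.substring('value', 20, 10).alias('npi_registry_id'))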
class NAACCR_Load(_RunScriptTask):
    '''Map and load NAACCR patients, tumors / visits, and facts.
    '''
    # flat file attributes
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()

    # encounter mapping
    encounter_ide_source = pv.StrParam(default='*****@*****.**')
    project_id = pv.StrParam(default='BlueHeron')
    source_cd = pv.StrParam(default='*****@*****.**')

    # ISSUE: task_id should depend on dest schema / owner.
    z_design_id = pv.StrParam(default='nested fields')

    script_name = 'naaccr_facts_load.sql'
    script_deid_name = 'i2b2_facts_deid.sql'
    script = res.read_text(heron_load, script_name)
    script_deid = res.read_text(heron_load, script_deid_name)

    @property
    def classpath(self) -> str:
        return self.jdbc_driver_jar

    def _upload_target(self) -> 'UploadTarget':
        return UploadTarget(self, self.schema, transform_name=self.task_id)

    def requires(self) -> Dict[str, luigi.Task]:
        _configure_logging(self.log_dest)
        ff = NAACCR_FlatFile(
            dateCaseReportExported=self.dateCaseReportExported,
            npiRegistryId=self.npiRegistryId)
        parts = {
            cls.__name__: cls(db_url=self.db_url,
                              user=self.user,
                              passkey=self.passkey,
                              dateCaseReportExported=self.dateCaseReportExported,
                              npiRegistryId=self.npiRegistryId)
            for cls in [NAACCR_Patients, NAACCR_Visits, NAACCR_Facts]
        }
        return dict(parts, NAACCR_FlatFile=ff)

    def _flat_file_task(self) -> NAACCR_FlatFile:
        return cast(NAACCR_FlatFile, self.requires()['NAACCR_FlatFile'])

    def _patients_task(self) -> NAACCR_Patients:
        return cast(NAACCR_Patients, self.requires()['NAACCR_Patients'])

    def run_upload(self, conn: Connection, upload_id: int) -> None:
        ff = self._flat_file_task()
        pat = self._patients_task()

        # ISSUE: split these into separate tasks?
        for name, script in [(self.script_name, self.script),
                             (self.script_deid_name, self.script_deid)]:
            self.run_script(
                conn, name, script,
                variables=dict(upload_id=str(upload_id),
                               task_id=self.task_id),
                script_params=dict(
                    upload_id=upload_id,
                    project_id=self.project_id,
                    task_id=self.task_id,
                    source_cd=self.source_cd,
                    download_date=ff.dateCaseReportExported,
                    patient_ide_source=pat.patient_ide_source,
                    encounter_ide_source=self.encounter_ide_source))
class NAACCR_FlatFile(ManualTask):
    """A NAACCR flat file is determined by the registry, export date,
    and version.
    """
    naaccrRecordVersion = pv.IntParam(default=180)
    dateCaseReportExported = pv.DateParam()
    npiRegistryId = pv.StrParam()
    testData = pv.BoolParam(default=False, significant=False)
    flat_file = pv.PathParam(significant=False)
    record_qty_min = pv.IntParam(significant=False, default=1)

    def check_version_param(self) -> None:
        """Only version 18 (180) is currently supported.
        """
        if self.naaccrRecordVersion != 180:
            raise NotImplementedError(
                f'naaccrRecordVersion {self.naaccrRecordVersion}; '
                'only 180 is supported')

    def complete(self) -> bool:
        with task_action(self, 'complete') as ctx:
            result = self.complete_action()
            ctx.add_success_fields(result=result)
            return result

    def complete_action(self) -> bool:
        """Check the first record, assuming all the others have the same
        export date and registry NPI.
        """
        self.check_version_param()

        with self.flat_file.open() as records:
            record0 = records.readline()
            qty = 1 + sum(1 for _ in records)
        log.info('record qty: %d (>= %d? %s)', qty,
                 self.record_qty_min, qty >= self.record_qty_min)

        vOk = self._checkItem(record0, 'naaccrRecordVersion',
                              str(self.naaccrRecordVersion))
        regOk = self._checkItem(record0, 'npiRegistryId',
                                self.npiRegistryId)
        dtOk = self._checkItem(record0, 'dateCaseReportExported',
                               self.dateCaseReportExported.strftime('%Y%m%d'))

        if vOk and regOk and dtOk and qty >= self.record_qty_min:
            return True
        else:
            if self.testData:
                log.warning('ignoring failed FlatFile check')
                return True
            return False

    @classmethod
    def _checkItem(cls, record: str, naaccrId: str, expected: str) -> bool:
        '''
        >>> npi = '1234567890'
        >>> record0 = ' ' * 19 + npi
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', npi)
        True
        >>> NAACCR_FlatFile._checkItem(record0, 'npiRegistryId', 'XXX')
        False
        '''
        itemDef = tr_ont.NAACCR1.itemDef(naaccrId)
        [startColumn, length] = [int(itemDef.attrib[it])
                                 for it in ['startColumn', 'length']]
        startColumn -= 1  # itemDef startColumn is 1-based; the slice is 0-based
        actual = record[startColumn:startColumn + length]
        if actual != expected:
            log.warning('%s: expected %s [%s:%s] = {%s} but found {%s}',
                        cls.__name__, naaccrId,
                        startColumn, startColumn + length,
                        expected, actual)
        return actual == expected
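
# Usage sketch (not part of the original module): the manual flat-file check
# can be exercised on its own through luigi's Python API. The file name, date,
# and NPI below are placeholders, and passing a plain string for the
# pv.PathParam is an assumption; since NAACCR_FlatFile is a ManualTask with
# nothing to run, luigi reports it as done only when the file's first record
# already matches the parameters.
def _flat_file_check_sketch() -> None:
    import datetime
    luigi.build(
        [NAACCR_FlatFile(
            dateCaseReportExported=datetime.date(2019, 9, 15),
            npiRegistryId='1234567890',
            flat_file='naaccr_export_placeholder.dat')],
        local_scheduler=True)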