class patient_chunks_survey(SqlScriptTask): script = Script.patient_chunks_survey patient_chunks = IntParam(default=20) patient_chunk_max = IntParam(default=None) @property def variables(self) -> Environment: return dict(chunk_qty=str(self.patient_chunks)) def run(self) -> None: SqlScriptTask.run_bound(self, script_params=dict(chunk_qty=str(self.patient_chunks))) def results(self) -> List[RowProxy]: with self.connection(event='survey results') as lc: q = ''' select patient_num , patient_num_qty , patient_num_first , patient_num_last from patient_chunks where chunk_qty = :chunk_qty and (:chunk_max is null or chunk_num <= :chunk_max) order by chunk_num ''' Params params = dict(chunk_max=self.patient_chunk_max, chunk_qty=self.patient_chunks) # type: Params try: return lc.execute(q, params=params).fetchall() except DatabaseError: return []
class FillTableFromView(_HarvestRefresh): '''Fill (insert into) PCORNet CDM table from a view of I2B2 data. Use HARVEST refresh columns to track completion status. ''' script = cast(Script, luigi.EnumParameter( enum=Script, description='script to build view')) view = StrParam(description='Transformation view') parallel_degree = IntParam(default=6, significant=False) pat_group_qty = IntParam(default=6, significant=False) def requires(self) -> List[luigi.Task]: return [ self.project, # I2B2 project SqlScriptTask(script=self.script, param_vars=self.variables), HarvestInit.script_with_vars(self.variables) ] bulk_insert = '''insert /*+ append parallel({degree}) */ into {ps}.{table} select * from {view}''' def load(self, work: LoggedConnection) -> None: step = self.bulk_insert.format(table=self.table, view=self.view, ps=self.harvest.schema, degree=self.parallel_degree) log_plan(work, 'fill chunk of {table}'.format(table=self.table), {}, sql=step) work.execute(step)
class LoadOntology(DBAccessTask): name = StrParam() prototype = StrParam() filename = StrParam() delimiter = StrParam(default=',') extra_cols = StrParam(default='') rowcount = IntParam(default=1) skip = IntParam(default=None) def requires(self) -> luigi.Task: return SaveOntology(filename=self.filename) def complete(self) -> bool: db = self._dbtarget().engine table = Table(self.name, sqla.MetaData(), Column('c_fullname', sqla.String)) if not table.exists(bind=db): log.info('no such table: %s', self.name) return False with self.connection() as q: actual = q.scalar(sqla.select([func.count(table.c.c_fullname)])) log.info('table %s has %d rows', self.name, actual) return actual >= self.rowcount # type: ignore # sqla def run(self) -> None: with self.input().dictreader(delimiter=self.delimiter, lowercase_fieldnames=True) as data: load(self._dbtarget().engine, data, self.name, self.prototype, skip=self.skip, extra_colnames=self.extra_cols.split(','))
class BeneIdSurvey(FromCMS, SqlScriptTask): script = Script.bene_chunks_survey bene_chunks = IntParam(default=200, description='see client.cfg') bene_chunk_max = IntParam(default=None, description='see client.cfg') @property def variables(self) -> Environment: config = [(lib.CMS_RIF, self.source.cms_rif)] return dict(config, chunk_qty=str(self.bene_chunks)) def run(self) -> None: SqlScriptTask.run_bound(self, script_params=dict(chunk_qty=self.bene_chunks)) def results(self) -> List[RowProxy]: with self.connection(event='survey results') as lc: q = ''' select chunk_num , bene_id_qty , bene_id_first , bene_id_last from bene_chunks where chunk_qty = :chunk_qty and (:chunk_max is null or chunk_num <= :chunk_max) order by chunk_num ''' params = dict(chunk_max=self.bene_chunk_max, chunk_qty=self.bene_chunks) # type: Params Params # tell flake8 we're using it. try: return lc.execute(q, params=params).fetchall() except DatabaseError: return []
class I2PPatientGroupTask(I2PScriptTask): patient_num_first = IntParam() patient_num_last = IntParam() patient_num_qty = IntParam(significant=False, default=-1) group_num = IntParam(significant=False, default=-1) group_qty = IntParam(significant=False, default=-1) def run(self) -> None: SqlScriptTask.run_bound(self, script_params=dict( patient_num_first=self.patient_num_first, patient_num_last=self.patient_num_last))
class MigrateUpload(SqlScriptTask, I2B2Task): upload_id = IntParam() workspace_star = StrParam() parallel_degree = IntParam(default=24, significant=False) script = Script.migrate_fact_upload @property def variables(self) -> Environment: return dict(I2B2STAR=self.project.star_schema, workspace_star=self.workspace_star, parallel_degree=str(self.parallel_degree), upload_id=str(self.upload_id))
class MigrateRows(DBAccessTask): '''Migrate e.g. from an analyst's ontology to runtime i2b2 metadata. ''' src = StrParam() dest = StrParam() # ListParam would be cleaner, but this avoids jenkins quoting foo. key_cols = StrParam() parallel_degree = IntParam(default=24) sql = """ delete from {dest} dest where exists ( select 1 from {src} src where {key_constraint} ); insert into {dest} select * from {src} """ def complete(self) -> bool: return False def run(self) -> None: key_constraints = [ 'src.{col} = dest.{col}'.format(col=col) for col in self.key_cols.split(',') ] sql = self.sql.format(src=self.src, dest=self.dest, key_constraint=' and '.join(key_constraints)) with self.connection('migrate rows') as work: for st in sql.split(';\n'): work.execute(st) work.execute('commit')
class MedparMapping(FromCMS, UploadTask): script = Script.medpar_encounter_map resources = {'encounter_mapping': 1} # This should probably be passed down from CSMRIFLoad # because that task and all the others in between # are a function of this parameter. encounter_num_start = IntParam() def requires(self) -> List[luigi.Task]: reset = MappingReset() return UploadTask.requires(self) + [self.source, reset] @property def variables(self) -> Environment: return dict(self.vars_for_deps, encounter_num_start=str(self.encounter_num_start))
class NPIDownloadConfig(luigi.Config): # The configured 'path' and 'npi' variables are used by the downloadNPI method to fetch and # store the NPPES zip file. Changes to these may require changes to the file system. # TODO: Update code to discover the npi_csv automatically. dl_path = StrParam(description='Path where the NPPES zip file will be stored and unzipped.') extract_path = StrParam(description='Path where the extract') npi_csv = StrParam(description='CSV file in the NPPES zip that contains NPI data.') npi_url = StrParam(description='URL for the NPPES download site.') npi_zip = StrParam(description='Name of the NPPES zip file.') # The configured 'col' and 'ct' variables reflect the layout of the NPI data file. # The extracNPI method uses these values to parse the NPI data file. # Changes to these may require code changes. # Complete overkill making these configurable. Consider reverting to hard coded values. taxonomy_col = StrParam(description='Header for the taxonomy columns in the NPI data file.') switch_col = StrParam(description='Header for the switch columns in the NPI data file.') npi_col = StrParam(description='Header for the NPI column in the NPI data file.') taxonomy_ct = IntParam(description='Number of taxonomy columns in the NPI data file.')
class DBAccessTask(luigi.Task): """Manage DB account credentials and logging. Typical usage:: with self.connection(event='munching data') as conn: howmany = conn.scalar('select count(*) from cookies') yummy = conn.execute('select * from cookies') """ account = StrParam(default=ETLAccount().account) ssh_tunnel = StrParam(default=ETLAccount().ssh_tunnel, significant=False) passkey = StrParam(default=ETLAccount().passkey, significant=False) echo = BoolParam(default=ETLAccount().echo, significant=False) max_idle = IntParam(description='Set to less than Oracle profile max idle time.', default=60 * 20, significant=False) _log = logging.getLogger(__name__) # ISSUE: ambient. def output(self) -> luigi.Target: return self._dbtarget() def _dbtarget(self) -> DBTarget: return DBTarget(self._make_url(self.account), target_table=None, update_id=self.task_id, echo=self.echo) def _make_url(self, account: str) -> str: url = make_url(account) if 'oracle' in account.lower(): url.query['pool_recycle'] = self.max_idle # `twophase` interferes with direct path load somehow. url.query['allow_twophase'] = False if self.passkey: from os import environ # ISSUE: ambient url.password = environ[self.passkey] if self.ssh_tunnel and url.host: host, port = self.ssh_tunnel.split(':', 1) url.host = host url.port = port return str(url) def log_info(self) -> Dict[str, Any]: '''Get info to log: luigi params, task_family, task_hash. ''' return dict(self.to_str_params(only_significant=True), task_family=self.task_family, task_hash=self.task_id[-luigi.task.TASK_ID_TRUNCATE_HASH:]) @contextmanager def connection(self, event: str='connect') -> Iterator[LoggedConnection]: log = EventLogger(self._log, self.log_info()) with log.step('%(event)s: <%(account)s>', dict(event=event, account=self.account)) as step: conn = ConnectionProblem.tryConnect(self._dbtarget().engine) try: yield LoggedConnection(conn, log, step) finally: conn.close() def _fix_password(self, environ: Dict[str, str], getpass: Callable[[str], str]) -> None: '''for interactive use; e.g. in notebooks ''' if self.passkey not in environ: environ[self.passkey] = getpass(self.passkey)
class FillTableFromView(DBAccessTask, I2B2Task): '''Fill (insert into) PCORNet CDM table from a view of I2B2 data. Use HARVEST refresh columns to track completion status. ''' table = StrParam(description='PCORNet CDM data table name') script = cast( Script, luigi.EnumParameter(enum=Script, description='script to build view')) view = StrParam(description='Transformation view') parallel_degree = IntParam(default=6, significant=False) pat_group_qty = IntParam(default=6, significant=False) # The PCORNet CDM HARVEST table has a refresh column for each # of the data tables -- 14 of them as of version 3.1. complete_test = 'select refresh_{table}_date from {ps}.harvest' @property def harvest(self) -> HarvestInit: return HarvestInit() def requires(self) -> List[luigi.Task]: return [ self.project, # I2B2 project SqlScriptTask(script=self.script, param_vars=self.variables), SqlScriptTask(script=Script.cdm_harvest_init, param_vars=self.variables) ] @property def variables(self) -> Environment: return dict(I2B2STAR=self.project.star_schema, PCORNET_CDM=self.harvest.schema) def complete(self) -> bool: deps = luigi.task.flatten(self.requires()) # type: List[luigi.Task] if not all(t.complete() for t in deps): return False table = self.table schema = self.harvest.schema with self.connection('{0} fresh?'.format(table)) as work: refreshed_at = work.scalar( self.complete_test.format(ps=schema, table=table)) return refreshed_at is not None steps = [ 'delete from {ps}.{table}', # ISSUE: lack of truncate privilege is a pain. 'commit', '''insert /*+ append parallel({parallel_degree}) */ into {ps}.{table} select * from {view} where patid between :lo and :hi''', "update {ps}.harvest set refresh_{table}_date = sysdate, datamart_claims = (select present from harvest_enum)" ] def run(self) -> None: with self.connection( 'refresh {table}'.format(table=self.table)) as work: groups = self.project.patient_groups(work, self.pat_group_qty) for step in self.steps: step = step.format(table=self.table, view=self.view, ps=self.harvest.schema, parallel_degree=self.parallel_degree) if insert_append_table(step): log_plan(work, 'fill chunk of {table}'.format(table=self.table), {}, sql=step) for (qty, num, lo, hi) in groups: work.execute(step, params=dict(lo=lo, hi=hi)) else: work.execute(step)
class CDMStatusTask(DBAccessTask): ''' A DBAccessTask that relies on the CDM status table to assess completion. Typical usage is to record the start of the task, run task operations and then record the end of the task. self.setTaskStart() self.load() self.setTaskEnd(self.getRecordCountFromTable()) ''' taskName = StrParam() # Basic status check, assume the typical task produces at least one record. expectedRecords = IntParam(default=1) statusTable = Table("cdm_status", MetaData(), Column('TASK'), Column('START_TIME'), Column('END_TIME'), Column('RECORDS')) def complete(self) -> bool: ''' Complete when the CDM status table reports at least as many records as expected for the task. ''' with self.connection() as q: statusTableRecordCount = q.scalar( 'select records from cdm_status where task = \'%s\'' % self.taskName) # If true, the task has not been logged in the CDM status table or has been logged and is in an # inconsistent state with the number of records set to null. if statusTableRecordCount is None: return False log.info('task %s has %d rows', self.taskName, statusTableRecordCount) return statusTableRecordCount >= self.expectedRecords # type: ignore # sqla def getRecordCountFromTable(self) -> int: ''' Queries the database for the number of records in the table named taskName ''' # This op is out of sync with the rest of the class, in that it assumes # the task must represent the creation of a table in the db. with self.connection() as q: return q.scalar( sqla.select([func.count() ]).select_from(self.taskName)) # type: ignore def setTaskEnd(self, rowCount: int) -> None: ''' Updates the taskName entry in the CDM status table with an end time of now and a count of records. ''' st = self.statusTable db = self._dbtarget().engine db.execute(st.update().where(st.c.TASK == self.taskName), [{ 'END_TIME': datetime.now(), 'RECORDS': rowCount }]) def setTaskStart(self) -> None: ''' Adds taskName to the CDM status table with a start time of now. ''' st = self.statusTable db = self._dbtarget().engine # prune any failed attempt db.execute(st.delete().where( sqla.and_(st.c.TASK == self.taskName, st.c.END_TIME == None))) # noqa db.execute(st.insert(), [{ 'TASK': self.taskName, 'START_TIME': datetime.now() }])
class MetaTableCountPatients(DBAccessTask): i2b2star = StrParam() i2b2meta = StrParam() c_table_cd = StrParam() cell_size_threshold = IntParam(default=11) sentinel = -1 def complete(self) -> bool: with self.connection('any c_totalnum needed?') as lc: return len(self.todo(lc)) == 0 def todo(self, lc: LoggedConnection) -> pd.DataFrame: desc = self.activeDescendants(lc) return desc[desc.c_totalnum.isnull()] def activeDescendants(self, lc: LoggedConnection) -> pd.DataFrame: top = self.top(lc) desc = read_sql_step( ''' select c_fullname, c_hlevel, c_visualattributes, c_totalnum, c_name, c_tooltip from {i2b2meta}.{meta_table} meta where meta.c_hlevel > :c_hlevel and meta.c_fullname like (:c_fullname || '%') and upper(meta.c_visualattributes) like '_A%' and c_synonym_cd = 'N' and m_applied_path = '@' order by meta.c_hlevel, upper(meta.c_name) '''.format(i2b2meta=self.i2b2meta, meta_table=top.c_table_name).strip(), lc=lc, params=dict(c_fullname=top.c_fullname, c_hlevel=int(top.c_hlevel))).set_index('c_fullname') return desc def top(self, lc: LoggedConnection) -> pd.Series: return read_sql_step( ''' select c_table_cd, c_hlevel, c_visualattributes, c_name , upper(c_table_name) c_table_name, c_fullname from {i2b2meta}.table_access ta where upper(ta.c_visualattributes) like '_A%' and ta.c_table_cd = :c_table_cd '''.format(i2b2meta=self.i2b2meta).strip(), lc, dict(c_table_cd=self.c_table_cd)).set_index('c_table_cd').iloc[0] def conceptPatientCount( self, top: pd.DataFrame, c_fullname: str, lc: LoggedConnection, # Moderate degree to support work on several tables in parallel. parallel_degree: int = 8 ) -> int: counts = read_sql_step( ''' select /*+ parallel({degree}) */ c_fullname, c_hlevel, c_visualattributes, c_name , case when upper(meta.c_visualattributes) like 'C%' then :sentinel * 1 when lower(meta.c_tablename) <> 'concept_dimension' or lower(meta.c_operator) <> 'like' or lower(meta.c_facttablecolumn) <> 'concept_cd' then :sentinel * 2 else coalesce(( select count(distinct obs.patient_num) from ( select concept_cd from {i2b2star}.concept_dimension where concept_path like (meta.c_dimcode || '%') ) cd join {i2b2star}.observation_fact obs on obs.concept_cd = cd.concept_cd), :sentinel * 3) end c_totalnum from {i2b2meta}.{table_name} meta where c_synonym_cd = 'N' and meta.c_fullname = :c_fullname '''.strip().format(i2b2star=self.i2b2star, i2b2meta=self.i2b2meta, degree=parallel_degree, table_name=top.c_table_name), lc=lc, params=dict(c_fullname=c_fullname, sentinel=self.sentinel)).set_index('c_fullname') [count] = counts.c_totalnum.values return int(count) def run(self) -> None: with self.connection('update patient counts in %s' % self.c_table_cd) as lc: top = self.top(lc) for c_fullname, concept in self.todo(lc).iterrows(): count = self.conceptPatientCount(top, c_fullname, lc) if 0 <= count < self.cell_size_threshold: count = self.sentinel * 5 lc.execute(''' update {i2b2meta}.{table_name} set c_totalnum = :total where c_fullname = :c_fullname '''.strip().format(i2b2meta=self.i2b2meta, table_name=top.c_table_name), params=dict(c_fullname=c_fullname, total=count)) lc.execute('commit')