class MetaTableIndex(DBAccessTask):
    i2b2meta = StrParam()
    c_table_name = StrParam()
    wanted = [['m_applied_path'], ['c_fullname']]

    def complete(self) -> bool:
        with self.connection('getting indexes for %s' % self.c_table_name) as lc:
            inspector = sqla.engine.reflection.Inspector(lc._conn)  # type: ignore
            indexes = inspector.get_indexes(self.c_table_name,
                                            schema=self.i2b2meta)
            indexed = [ix['column_names'] for ix in indexes]
            log.info('already indexed: %s', indexed)
            return all(cols in indexed for cols in self.wanted)

    def run(self) -> None:
        md = sqla.MetaData()
        with self.connection('adding indexes to %s' % self.c_table_name) as lc:
            t = sqla.Table(self.c_table_name, md,
                           autoload=True, autoload_with=lc._conn,
                           schema=self.i2b2meta)
            for ix_num, ix_cols in enumerate(self.wanted):
                # Truncate the name parts to keep index names short
                # (Oracle identifier length limits).
                ix = sqla.schema.Index(
                    "%s_%s_%d" % (t.name[:20], ix_cols[0][:8], ix_num),
                    *[t.c[cn] for cn in ix_cols])
                ix.create(lc._conn)

class LoadOntology(DBAccessTask):
    name = StrParam()
    prototype = StrParam()
    filename = StrParam()
    delimiter = StrParam(default=',')
    extra_cols = StrParam(default='')
    rowcount = IntParam(default=1)
    skip = IntParam(default=None)

    def requires(self) -> luigi.Task:
        return SaveOntology(filename=self.filename)

    def complete(self) -> bool:
        db = self._dbtarget().engine
        table = Table(self.name, sqla.MetaData(),
                      Column('c_fullname', sqla.String))
        if not table.exists(bind=db):
            log.info('no such table: %s', self.name)
            return False
        with self.connection() as q:
            actual = q.scalar(sqla.select([func.count(table.c.c_fullname)]))
            log.info('table %s has %d rows', self.name, actual)
            return actual >= self.rowcount  # type: ignore  # sqla

    def run(self) -> None:
        with self.input().dictreader(delimiter=self.delimiter,
                                     lowercase_fieldnames=True) as data:
            load(self._dbtarget().engine, data,
                 self.name, self.prototype,
                 skip=self.skip,
                 extra_colnames=self.extra_cols.split(','))

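# Usage sketch (hypothetical values; per load()'s signature, `prototype` is
# taken here to name an existing table whose shape the new table copies):
#
#   luigi --module <this module> LoadOntology --name MY_ONT \
#       --prototype i2b2metadata.i2b2 --filename my_ont.csv --rowcount 100
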
class MigrateRows(DBAccessTask):
    '''Migrate e.g. from an analyst's ontology to runtime i2b2 metadata.
    '''
    src = StrParam()
    dest = StrParam()
    # ListParam would be cleaner, but this avoids jenkins quoting foo.
    key_cols = StrParam()
    parallel_degree = IntParam(default=24)

    sql = """
    delete from {dest} dest
    where exists (
      select 1 from {src} src
      where {key_constraint}
    );
    insert into {dest} select * from {src}
    """

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        key_constraints = [
            'src.{col} = dest.{col}'.format(col=col)
            for col in self.key_cols.split(',')
        ]
        sql = self.sql.format(src=self.src, dest=self.dest,
                              key_constraint=' and '.join(key_constraints))
        with self.connection('migrate rows') as work:
            for st in sql.split(';\n'):
                work.execute(st)
            work.execute('commit')

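# Rendered-SQL sketch for hypothetical parameters src='WORK.ONT',
# dest='META.ONT', key_cols='c_fullname,m_applied_path':
#
#   delete from META.ONT dest
#   where exists (
#     select 1 from WORK.ONT src
#     where src.c_fullname = dest.c_fullname
#       and src.m_applied_path = dest.m_applied_path
#   )
#   -- then, as a separate statement:
#   insert into META.ONT select * from WORK.ONT
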
class I2PConfig(luigi.Config):
    datamart_id = StrParam(description='see client.cfg')
    datamart_name = StrParam(description='see client.cfg')
    enrollment_months_back = StrParam(description='see client.cfg')
    i2b2_data_schema = StrParam(description='see client.cfg')
    i2b2_etl_schema = StrParam(description='see client.cfg')
    i2b2_meta_schema = StrParam(description='see client.cfg')
    min_pat_list_date_dd_mon_rrrr = StrParam(description='see client.cfg')
    min_visit_date_dd_mon_rrrr = StrParam(description='see client.cfg')
    network_id = StrParam(description='see client.cfg')
    network_name = StrParam(description='see client.cfg')

class MetaTableResetCounts(DBAccessTask):
    i2b2meta = StrParam()
    c_table_name = StrParam()

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        with self.connection('resetting c_totalnum for %s' % self.c_table_name) as lc:
            lc.execute('''
                update {i2b2meta}.{table_name}
                set c_totalnum = null
                '''.strip().format(i2b2meta=self.i2b2meta,
                                   table_name=self.c_table_name))

class ETLAccount(luigi.Config):
    '''Access to connect and run ETL.

    This account needs read access to source material, write access to
    the destination i2b2 star schema, and working space for intermediary
    tables, views, functions, and such.
    '''
    account = StrParam(description='see client.cfg', default='')
    passkey = StrParam(description='see client.cfg', default='',
                       significant=False)
    ssh_tunnel = StrParam(description='see client.cfg', default='',
                          significant=False)
    echo = BoolParam(description='SQLAlchemy echo logging',
                     significant=False)

class MetaCountPatients(_ForEachMetaTable):
    i2b2star = StrParam()

    def subTask(self, table_cd: str, info: pd.Series) -> luigi.Task:
        return MetaTableCountPatients(i2b2star=self.i2b2star,
                                      i2b2meta=self.i2b2meta,
                                      c_table_cd=table_cd)

class MigratePendingUploads(DBAccessTask, I2B2Task, luigi.WrapperTask):
    workspace_star = StrParam()

    find_pending = """
    select upload_id from %(WORKSPACE)s.upload_status
    where load_status in ('OK', 'OK_work')
    and upload_id not in (
        select upload_id from %(I2B2STAR)s.upload_status
        where load_status='OK'
    )
    """

    def requires(self) -> Iterator[luigi.Task]:
        find_pending = self.find_pending % dict(
            WORKSPACE=self.workspace_star,
            I2B2STAR=self.project.star_schema)
        with self.connection('pending uploads') as lc:
            pending = [row.upload_id
                       for row in lc.execute(find_pending).fetchall()]
            workmeta = sqla.MetaData()
            for upload_id in pending:
                table = Table('observation_fact_%d' % upload_id, workmeta,
                              schema=self.workspace_star)
                if table.exists(bind=lc._conn):
                    yield MigrateUpload(upload_id=upload_id,
                                        workspace_star=self.workspace_star)
                else:
                    log.warning('no such table to migrate: %s', table)

class FillTableFromView(_HarvestRefresh):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    script = cast(Script, luigi.EnumParameter(
        enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script, param_vars=self.variables),
            HarvestInit.script_with_vars(self.variables)
        ]

    bulk_insert = '''insert /*+ append parallel({degree}) */ into {ps}.{table}
    select * from {view}'''

    def load(self, work: LoggedConnection) -> None:
        step = self.bulk_insert.format(table=self.table, view=self.view,
                                       ps=self.harvest.schema,
                                       degree=self.parallel_degree)
        log_plan(work, 'fill chunk of {table}'.format(table=self.table), {},
                 sql=step)
        work.execute(step)

class CMSExtract(SourceTask, DBAccessTask):
    download_date = TimeStampParam(description='see client.cfg')
    cms_rif = StrParam(description='see client.cfg')
    script_variable = 'cms_source_cd'
    # ISSUE: separate source_cd for yr1-3 vs 4-5? see MappingReset
    source_cd = "'ccwdata.org'"
    table_eg = 'pde'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.cms_rif,
                            table_eg=self.table_eg,
                            echo=self.echo)

    def table_details(self, lc: LoggedConnection, tables: List[str]) -> MetaData:
        rif_meta = MetaData(schema=self.cms_rif)
        rif_meta.reflect(only=tables, schema=self.cms_rif,
                         bind=self._dbtarget().engine)
        return rif_meta

    def run(self) -> None:
        raise NotImplementedError(
            'cannot find %s.%s. CMS Extract is built elsewhere.' % (
                self.cms_rif, self.table_eg))

class AdHoc(DBAccessTask):
    sql = StrParam()
    name = StrParam()

    def _csvout(self) -> CSVTarget:
        return CSVTarget(path=self.name + '.csv')

    def output(self) -> luigi.Target:
        return self._csvout()

    def run(self) -> None:
        with self.connection() as work:
            result = work.execute(self.sql)
            cols = result.keys()
            rows = result.fetchall()
            self._csvout().export(cols, rows)

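# Usage sketch (hypothetical name and query): export an ad hoc query to
# cookie_count.csv in the working directory:
#
#   luigi --module <this module> AdHoc --name cookie_count \
#       --sql 'select count(*) n from cookies'
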
class SaveOntology(luigi.Task):
    filename = StrParam()

    def output(self) -> luigi.Target:
        return CSVTarget(path=self.filename)

    def requires(self) -> List[luigi.Task]:
        return []

class HarvestInit(SqlScriptTask):
    '''Create HARVEST table with one row.
    '''
    script = Script.cdm_harvest_init
    schema = StrParam(description='PCORNet CDM schema name',
                      default='CMS_PCORNET_CDM')

    @property
    def variables(self) -> Environment:
        return dict(PCORNET_CDM=self.schema)

class _ForEachMetaTable(DBAccessTask, luigi.WrapperTask):
    i2b2meta = StrParam()

    def requires(self) -> List[luigi.Task]:
        with self.connection('finding metadata tables') as lc:
            each = topFolders(self.i2b2meta, lc)
            return [
                self.subTask(table_cd, info)
                for table_cd, info in each.iterrows()
            ]

    def subTask(self, table_cd: str, info: pd.Series) -> luigi.Task:
        raise NotImplementedError

class MigrateUpload(SqlScriptTask, I2B2Task):
    upload_id = IntParam()
    workspace_star = StrParam()
    parallel_degree = IntParam(default=24, significant=False)

    script = Script.migrate_fact_upload

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    workspace_star=self.workspace_star,
                    parallel_degree=str(self.parallel_degree),
                    upload_id=str(self.upload_id))

class MetaToConcepts(UploadTask):
    script = Script.concept_dimension_fill
    ont_table_name = StrParam(
        description="table to scan for c_tablename = 'concept_dimension' records")

    @property
    def i2b2meta(self) -> str:
        raise NotImplementedError('subclass must implement')

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    I2B2META=self.i2b2meta,
                    ONT_TABLE_NAME=self.ont_table_name)

class ExploreSchema(DBAccessTask):
    schema_name = StrParam()

    def _csvout(self) -> CSVTarget:
        return CSVTarget(path=self.schema_name + '.csv')

    def output(self) -> luigi.Target:
        return self._csvout()

    def run(self) -> None:
        conn = self._dbtarget().engine.connect()
        with conn.begin():
            info = ColumnInfo.from_owner(conn, self.schema_name)
            self._csvout().export(list(ColumnInfo._fields), info)

class KillSessions(DBAccessTask):
    reason = StrParam(default='*no reason given*')

    sql = '''
    begin
      sys.kill_own_sessions(:reason);
    end;
    '''

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        with self.connection('kill own sessions') as work:
            work.execute(self.sql, params=dict(reason=self.reason))

class NPIDownloadConfig(luigi.Config):
    # The configured 'path' and 'npi' variables are used by the downloadNPI
    # method to fetch and store the NPPES zip file. Changes to these may
    # require changes to the file system.
    # TODO: Update code to discover the npi_csv automatically.
    dl_path = StrParam(description='Path where the NPPES zip file will be stored and unzipped.')
    extract_path = StrParam(description='Path where the extracted NPI data is written.')
    npi_csv = StrParam(description='CSV file in the NPPES zip that contains NPI data.')
    npi_url = StrParam(description='URL for the NPPES download site.')
    npi_zip = StrParam(description='Name of the NPPES zip file.')

    # The configured 'col' and 'ct' variables reflect the layout of the NPI
    # data file; the extractNPI method uses these values to parse it.
    # Changes to these may require code changes.
    # Complete overkill making these configurable. Consider reverting to
    # hard-coded values.
    taxonomy_col = StrParam(description='Header for the taxonomy columns in the NPI data file.')
    switch_col = StrParam(description='Header for the switch columns in the NPI data file.')
    npi_col = StrParam(description='Header for the NPI column in the NPI data file.')
    taxonomy_ct = IntParam(description='Number of taxonomy columns in the NPI data file.')

class _HarvestRefresh(DBAccessTask, I2B2Task):
    table = StrParam(description='PCORNet CDM data table name')

    # The PCORNet CDM HARVEST table has a refresh column for each
    # of the data tables -- 14 of them as of version 3.1.
    complete_test = 'select refresh_{table}_date from {ps}.harvest'

    @property
    def harvest(self) -> HarvestInit:
        return HarvestInit()

    def complete(self) -> bool:
        deps = luigi.task.flatten(self.requires())  # type: List[luigi.Task]
        if not all(t.complete() for t in deps):
            return False

        table = self.table
        schema = self.harvest.schema
        with self.connection('{0} fresh?'.format(table)) as work:
            refreshed_at = work.scalar(self.complete_test.format(
                ps=schema, table=table))
        return refreshed_at is not None

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    PCORNET_CDM=self.harvest.schema)

    steps = [
        'delete from {ps}.{table}',  # ISSUE: lack of truncate privilege is a pain.
        'commit',
        None,  # placeholder: run() calls the subclass load() hook here.
        "update {ps}.harvest set refresh_{table}_date = sysdate"
        ", datamart_claims = (select present from harvest_enum)",
    ]

    def run(self) -> None:
        with self.connection('refresh {table}'.format(table=self.table)) as work:
            for step in self.steps:
                if step is None:
                    self.load(work)
                else:
                    step = step.format(table=self.table, ps=self.harvest.schema)
                    work.execute(step)

    def load(self, work: LoggedConnection) -> None:
        raise NotImplementedError('abstract')

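# _HarvestRefresh is a template method: run() walks `steps`, executing each
# SQL string and invoking the subclass's load() at the `None` slot -- that is,
# after the delete/commit and before the HARVEST bookkeeping update.
# FillTableFromView above is the concrete case: its load() does the bulk
# insert from the transformation view.
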
class LoadCSV(CDMStatusTask):
    '''Creates a table in the db with the name taskName.

    The table is loaded with data from the csv file specified by csvname.
    Table creation is logged in the cdm status table.
    '''
    csvname = StrParam()

    def run(self) -> None:
        self.setTaskStart()
        self.load()
        self.setTaskEnd(self.getRecordCountFromTable())

    def load(self) -> None:
        def sz(l: int, chunk: int = 16) -> int:
            return max(chunk, chunk * ((l + chunk - 1) // chunk))

        db = self._dbtarget().engine
        schema = MetaData()
        rows = list()
        with open(self.csvname) as fin:  # ISSUE: ambient
            dr = DictReader(fin)
            Dict  # for tools that don't see type: comments.
            mcl = defaultdict(int)  # type: Dict[str, int]
            # Iterate data in the input csv, find the largest data item
            # and set the column size to the item size rounded to the
            # nearest chunk.
            for row in dr:
                rows.append(row)
                for col in dr.fieldnames:
                    mcl[col] = sz(max(mcl[col], len(row[col])))
            columns = [Column(n, String(mcl[n])) for n in dr.fieldnames]
            table = Table(self.taskName, schema, *columns)
            if table.exists(bind=db):
                table.drop(db)
            table.create(db)
            db.execute(table.insert(), rows)

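# Sizing sketch: sz() rounds a length up to the next multiple of `chunk`,
# with a floor of one chunk, e.g. sz(0) == 16, sz(16) == 16, sz(17) == 32.
# Presumably this keeps the generated VARCHAR widths stable across loads
# whose longest values differ only slightly.
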
class CMSExtract(SourceTask, DBAccessTask):
    download_date = TimeStampParam(description='see client.cfg')
    cms_rif = StrParam(description='see client.cfg')
    script_variable = 'cms_source_cd'
    source_cd = "'ccwdata.org'"
    table_eg = 'mbsf_ab_summary'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.cms_rif,
                            table_eg=self.table_eg,
                            echo=self.echo)

    def table_details(self, lc: LoggedConnection, tables: List[str]) -> MetaData:
        rif_meta = MetaData(schema=self.cms_rif)
        rif_meta.reflect(only=tables, schema=self.cms_rif,
                         bind=self._dbtarget().engine)
        return rif_meta

class TerminologyDumpImport(SourceTask, DBAccessTask):
    '''
    - Shrine Terminology Files - Oracle
      Reeder at UTSouthwestern, Mon Apr 10 10:39:05 CDT 2017
      http://listserv.kumc.edu/pipermail/gpc-dev/2017q2/003805.html

    - basic SNOW ontology alignment: Demographics, Diagnoses
      GPC:ticket:525#comment:12, Apr 17
      https://informatics.gpcnetwork.org/trac/Project/ticket/525#comment:12

    - PCORIMETADATA_3_24_2017.dmp
    '''
    # March 24 is really the last modified date.
    # Jun 6 09:17 is when I downloaded it and imported it.
    download_date = cast(datetime, TimeStampParameter(default=datetime(2017, 3, 24)))
    source_cd = "'PCORNET_CDM'"
    pcorimetadata = StrParam(default='PCORIMETADATA')
    table_eg = 'pcornet_demo'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.pcorimetadata,
                            table_eg=self.table_eg,
                            echo=self.echo)

class MetaTableCountPatients(DBAccessTask):
    i2b2star = StrParam()
    i2b2meta = StrParam()
    c_table_cd = StrParam()
    cell_size_threshold = IntParam(default=11)

    # Negative c_totalnum values encode why a real count is unavailable:
    # sentinel * 1: container (C) term; * 2: term not queryable via
    # concept_dimension / like / concept_cd; * 3: fallback if the
    # patient-count subquery yields null; * 5: count suppressed as below
    # cell_size_threshold.
    sentinel = -1

    def complete(self) -> bool:
        with self.connection('any c_totalnum needed?') as lc:
            return len(self.todo(lc)) == 0

    def todo(self, lc: LoggedConnection) -> pd.DataFrame:
        desc = self.activeDescendants(lc)
        return desc[desc.c_totalnum.isnull()]

    def activeDescendants(self, lc: LoggedConnection) -> pd.DataFrame:
        top = self.top(lc)
        desc = read_sql_step(
            '''
            select c_fullname, c_hlevel, c_visualattributes, c_totalnum, c_name, c_tooltip
            from {i2b2meta}.{meta_table} meta
            where meta.c_hlevel > :c_hlevel
              and meta.c_fullname like (:c_fullname || '%')
              and upper(meta.c_visualattributes) like '_A%'
              and c_synonym_cd = 'N'
              and m_applied_path = '@'
            order by meta.c_hlevel, upper(meta.c_name)
            '''.format(i2b2meta=self.i2b2meta,
                       meta_table=top.c_table_name).strip(),
            lc=lc,
            params=dict(c_fullname=top.c_fullname,
                        c_hlevel=int(top.c_hlevel))).set_index('c_fullname')
        return desc

    def top(self, lc: LoggedConnection) -> pd.Series:
        return read_sql_step(
            '''
            select c_table_cd, c_hlevel, c_visualattributes, c_name
                 , upper(c_table_name) c_table_name, c_fullname
            from {i2b2meta}.table_access ta
            where upper(ta.c_visualattributes) like '_A%'
              and ta.c_table_cd = :c_table_cd
            '''.format(i2b2meta=self.i2b2meta).strip(),
            lc, dict(c_table_cd=self.c_table_cd)).set_index('c_table_cd').iloc[0]

    def conceptPatientCount(
            self, top: pd.Series, c_fullname: str, lc: LoggedConnection,
            # Moderate degree to support work on several tables in parallel.
            parallel_degree: int = 8) -> int:
        counts = read_sql_step(
            '''
            select /*+ parallel({degree}) */ c_fullname, c_hlevel, c_visualattributes, c_name
                 , case
                   when upper(meta.c_visualattributes) like 'C%'
                   then :sentinel * 1
                   when lower(meta.c_tablename) <> 'concept_dimension'
                     or lower(meta.c_operator) <> 'like'
                     or lower(meta.c_facttablecolumn) <> 'concept_cd'
                   then :sentinel * 2
                   else coalesce((
                     select count(distinct obs.patient_num)
                     from (
                       select concept_cd from {i2b2star}.concept_dimension
                       where concept_path like (meta.c_dimcode || '%')
                     ) cd
                     join {i2b2star}.observation_fact obs
                       on obs.concept_cd = cd.concept_cd), :sentinel * 3)
                   end c_totalnum
            from {i2b2meta}.{table_name} meta
            where c_synonym_cd = 'N'
              and meta.c_fullname = :c_fullname
            '''.strip().format(i2b2star=self.i2b2star,
                               i2b2meta=self.i2b2meta,
                               degree=parallel_degree,
                               table_name=top.c_table_name),
            lc=lc,
            params=dict(c_fullname=c_fullname,
                        sentinel=self.sentinel)).set_index('c_fullname')
        [count] = counts.c_totalnum.values
        return int(count)

    def run(self) -> None:
        with self.connection('update patient counts in %s' % self.c_table_cd) as lc:
            top = self.top(lc)
            for c_fullname, concept in self.todo(lc).iterrows():
                count = self.conceptPatientCount(top, c_fullname, lc)
                if 0 <= count < self.cell_size_threshold:
                    count = self.sentinel * 5
                lc.execute('''
                    update {i2b2meta}.{table_name}
                    set c_totalnum = :total
                    where c_fullname = :c_fullname
                    '''.strip().format(i2b2meta=self.i2b2meta,
                                       table_name=top.c_table_name),
                           params=dict(c_fullname=c_fullname, total=count))
            lc.execute('commit')

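# Usage sketch (hypothetical schema and table_cd values):
#
#   luigi --module <this module> MetaTableCountPatients \
#       --i2b2star I2B2DEMODATA --i2b2meta I2B2METADATA --c-table-cd ICD10
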
class CDMStatusTask(DBAccessTask):
    '''A DBAccessTask that relies on the CDM status table to assess completion.

    Typical usage is to record the start of the task, run task operations
    and then record the end of the task:

        self.setTaskStart()
        self.load()
        self.setTaskEnd(self.getRecordCountFromTable())
    '''
    taskName = StrParam()
    # Basic status check; assume the typical task produces at least one record.
    expectedRecords = IntParam(default=1)

    statusTable = Table("cdm_status", MetaData(),
                        Column('TASK'), Column('START_TIME'),
                        Column('END_TIME'), Column('RECORDS'))

    def complete(self) -> bool:
        '''Complete when the CDM status table reports at least as many
        records as expected for the task.
        '''
        with self.connection() as q:
            statusTableRecordCount = q.scalar(
                "select records from cdm_status where task = '%s'" % self.taskName)

            # If None, the task has not been logged in the CDM status table,
            # or it has been logged but is in an inconsistent state with the
            # number of records set to null.
            if statusTableRecordCount is None:
                return False

            log.info('task %s has %d rows', self.taskName, statusTableRecordCount)
            return statusTableRecordCount >= self.expectedRecords  # type: ignore  # sqla

    def getRecordCountFromTable(self) -> int:
        '''Queries the database for the number of records in the table
        named taskName.
        '''
        # This op is out of sync with the rest of the class, in that it assumes
        # the task must represent the creation of a table in the db.
        with self.connection() as q:
            return q.scalar(
                sqla.select([func.count()]).select_from(self.taskName))  # type: ignore

    def setTaskEnd(self, rowCount: int) -> None:
        '''Updates the taskName entry in the CDM status table with an
        end time of now and a count of records.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        db.execute(st.update().where(st.c.TASK == self.taskName),
                   [{'END_TIME': datetime.now(), 'RECORDS': rowCount}])

    def setTaskStart(self) -> None:
        '''Adds taskName to the CDM status table with a start time of now.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        # prune any failed attempt
        db.execute(st.delete().where(
            sqla.and_(st.c.TASK == self.taskName,
                      st.c.END_TIME == None)))  # noqa
        db.execute(st.insert(),
                   [{'TASK': self.taskName, 'START_TIME': datetime.now()}])

class I2B2ProjectCreate(DBAccessTask):
    star_schema = StrParam(description='see client.cfg')
    project_id = StrParam(description='see client.cfg')
    _meta = None  # type: Opt[sqla.MetaData]
    _upload_table = None  # type: Opt[sqla.Table]

    Column, ty = sqla.Column, sqla.types
    upload_status_columns = [
        Column('upload_id', ty.Numeric(38, 0, asdecimal=False), primary_key=True),
        Column('upload_label', ty.String(500), nullable=False),
        Column('user_id', ty.String(100), nullable=False),
        Column('source_cd', ty.String(50), nullable=False),
        Column('no_of_record', ty.Numeric(asdecimal=False)),
        Column('loaded_record', ty.Numeric(asdecimal=False)),
        Column('deleted_record', ty.Numeric(asdecimal=False)),
        Column('load_date', ty.DateTime, nullable=False),
        Column('end_date', ty.DateTime),
        Column('load_status', ty.String(100)),
        Column('message', ty.Text),
        Column('input_file_name', ty.Text),
        Column('log_file_name', ty.Text),
        Column('transform_name', ty.String(500)),
    ]

    def output(self) -> 'SchemaTarget':
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.star_schema,
                            table_eg='patient_dimension',
                            echo=self.echo)

    def run(self) -> None:
        raise NotImplementedError('see heron_create.create_deid_datamart etc.')

    @property
    def metadata(self) -> sqla.MetaData:
        if self._meta:
            return self._meta
        self._meta = meta = sqla.MetaData(schema=self.star_schema)
        return meta

    def table_details(self, lc: LoggedConnection, tables: List[str]) -> sqla.MetaData:
        i2b2_meta = sqla.MetaData(schema=self.star_schema)
        i2b2_meta.reflect(only=tables, schema=self.star_schema,
                          bind=self._dbtarget().engine)
        return i2b2_meta

    @property
    def upload_table(self) -> sqla.Table:
        if self._upload_table is not None:
            return self._upload_table
        t = sqla.Table('upload_status', self.metadata,
                       *self.upload_status_columns,
                       schema=self.star_schema)
        self._upload_table = t
        return t

    pat_grp_q = '''
    select :group_qty grp_qty, group_num
         , min(patient_num) patient_num_lo
         , max(patient_num) patient_num_hi
    from (
        select patient_num
             , ntile(:group_qty) over (order by patient_num) as group_num
        from (
            select /*+ parallel(20) */ distinct patient_num
            from {i2b2_star}.patient_dimension
            where patient_num is not null
        ) ea
    ) w_ntile
    group by group_num, :group_qty
    order by group_num
    '''

    def patient_groups(self, q: LoggedConnection, qty: int) -> List[RowProxy]:
        groups = q.execute(self.pat_grp_q.format(i2b2_star=self.star_schema),
                           params=dict(group_qty=qty)).fetchall()
        q.log.info('groups: %s', groups)
        return groups

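# patient_groups() sketch: with group_qty=3, the ntile query yields one row
# per group -- (grp_qty, group_num, patient_num_lo, patient_num_hi) --
# partitioning the distinct patient_num values into 3 roughly equal,
# contiguous ranges; FillTableFromView below binds each lo/hi pair to chunk
# its bulk insert.
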
class DBAccessTask(luigi.Task):
    """Manage DB account credentials and logging.

    Typical usage::

        with self.connection(event='munching data') as conn:
            howmany = conn.scalar('select count(*) from cookies')
            yummy = conn.execute('select * from cookies')
    """
    account = StrParam(default=ETLAccount().account)
    ssh_tunnel = StrParam(default=ETLAccount().ssh_tunnel, significant=False)
    passkey = StrParam(default=ETLAccount().passkey, significant=False)
    echo = BoolParam(default=ETLAccount().echo, significant=False)
    max_idle = IntParam(description='Set to less than Oracle profile max idle time.',
                        default=60 * 20, significant=False)
    _log = logging.getLogger(__name__)  # ISSUE: ambient.

    def output(self) -> luigi.Target:
        return self._dbtarget()

    def _dbtarget(self) -> DBTarget:
        return DBTarget(self._make_url(self.account),
                        target_table=None, update_id=self.task_id,
                        echo=self.echo)

    def _make_url(self, account: str) -> str:
        url = make_url(account)
        if 'oracle' in account.lower():
            url.query['pool_recycle'] = self.max_idle
            # `twophase` interferes with direct path load somehow.
            url.query['allow_twophase'] = False
        if self.passkey:
            from os import environ  # ISSUE: ambient
            url.password = environ[self.passkey]
        if self.ssh_tunnel and url.host:
            host, port = self.ssh_tunnel.split(':', 1)
            url.host = host
            url.port = port
        return str(url)

    def log_info(self) -> Dict[str, Any]:
        '''Get info to log: luigi params, task_family, task_hash.
        '''
        return dict(self.to_str_params(only_significant=True),
                    task_family=self.task_family,
                    task_hash=self.task_id[-luigi.task.TASK_ID_TRUNCATE_HASH:])

    @contextmanager
    def connection(self, event: str = 'connect') -> Iterator[LoggedConnection]:
        log = EventLogger(self._log, self.log_info())
        with log.step('%(event)s: <%(account)s>',
                      dict(event=event, account=self.account)) as step:
            conn = ConnectionProblem.tryConnect(self._dbtarget().engine)
            try:
                yield LoggedConnection(conn, log, step)
            finally:
                conn.close()

    def _fix_password(self, environ: Dict[str, str],
                      getpass: Callable[[str], str]) -> None:
        '''for interactive use; e.g. in notebooks
        '''
        if self.passkey not in environ:
            environ[self.passkey] = getpass(self.passkey)

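# Interactive sketch (e.g. in a notebook), assuming a task instance `t`
# with a passkey configured:
#
#   from os import environ
#   from getpass import getpass
#   t._fix_password(environ, getpass)  # prompts once, caches in environ
#   with t.connection('ad hoc query') as lc:
#       lc.scalar('select 1 from dual')
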
class FillTableFromView(DBAccessTask, I2B2Task):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    table = StrParam(description='PCORNet CDM data table name')
    script = cast(Script, luigi.EnumParameter(
        enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    # The PCORNet CDM HARVEST table has a refresh column for each
    # of the data tables -- 14 of them as of version 3.1.
    complete_test = 'select refresh_{table}_date from {ps}.harvest'

    @property
    def harvest(self) -> HarvestInit:
        return HarvestInit()

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script, param_vars=self.variables),
            SqlScriptTask(script=Script.cdm_harvest_init,
                          param_vars=self.variables)
        ]

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    PCORNET_CDM=self.harvest.schema)

    def complete(self) -> bool:
        deps = luigi.task.flatten(self.requires())  # type: List[luigi.Task]
        if not all(t.complete() for t in deps):
            return False

        table = self.table
        schema = self.harvest.schema
        with self.connection('{0} fresh?'.format(table)) as work:
            refreshed_at = work.scalar(
                self.complete_test.format(ps=schema, table=table))
        return refreshed_at is not None

    steps = [
        'delete from {ps}.{table}',  # ISSUE: lack of truncate privilege is a pain.
        'commit',
        '''insert /*+ append parallel({parallel_degree}) */ into {ps}.{table}
           select * from {view} where patid between :lo and :hi''',
        "update {ps}.harvest set refresh_{table}_date = sysdate"
        ", datamart_claims = (select present from harvest_enum)",
    ]

    def run(self) -> None:
        with self.connection('refresh {table}'.format(table=self.table)) as work:
            groups = self.project.patient_groups(work, self.pat_group_qty)
            for step in self.steps:
                step = step.format(table=self.table, view=self.view,
                                   ps=self.harvest.schema,
                                   parallel_degree=self.parallel_degree)
                if insert_append_table(step):
                    log_plan(work, 'fill chunk of {table}'.format(table=self.table),
                             {}, sql=step)
                    for (qty, num, lo, hi) in groups:
                        work.execute(step, params=dict(lo=lo, hi=hi))
                else:
                    work.execute(step)