コード例 #1
0
class patient_chunks_survey(SqlScriptTask):
    script = Script.patient_chunks_survey
    patient_chunks = IntParam(default=20)
    patient_chunk_max = IntParam(default=None)

    @property
    def variables(self) -> Environment:
        return dict(chunk_qty=str(self.patient_chunks))

    def run(self) -> None:
        SqlScriptTask.run_bound(self, script_params=dict(chunk_qty=str(self.patient_chunks)))

    def results(self) -> List[RowProxy]:
        with self.connection(event='survey results') as lc:
            q = '''
               select patient_num
                 , patient_num_qty
                 , patient_num_first
                 , patient_num_last
               from patient_chunks
               where chunk_qty = :chunk_qty
                 and (:chunk_max is null or
                      chunk_num <= :chunk_max)
               order by chunk_num
             '''
            Params
            params = dict(chunk_max=self.patient_chunk_max, chunk_qty=self.patient_chunks)  # type: Params

            try:
                return lc.execute(q, params=params).fetchall()
            except DatabaseError:
                return []
コード例 #2
0
class FillTableFromView(_HarvestRefresh):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    script = cast(Script, luigi.EnumParameter(
        enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script,
                          param_vars=self.variables),
            HarvestInit.script_with_vars(self.variables)
        ]

    bulk_insert = '''insert /*+ append parallel({degree}) */ into {ps}.{table}
           select * from {view}'''

    def load(self, work: LoggedConnection) -> None:
        step = self.bulk_insert.format(table=self.table, view=self.view,
                                       ps=self.harvest.schema,
                                       degree=self.parallel_degree)
        log_plan(work, 'fill chunk of {table}'.format(table=self.table), {},
                 sql=step)
        work.execute(step)
コード例 #3
0
class LoadOntology(DBAccessTask):
    name = StrParam()
    prototype = StrParam()
    filename = StrParam()
    delimiter = StrParam(default=',')
    extra_cols = StrParam(default='')
    rowcount = IntParam(default=1)
    skip = IntParam(default=None)

    def requires(self) -> luigi.Task:
        return SaveOntology(filename=self.filename)

    def complete(self) -> bool:
        db = self._dbtarget().engine
        table = Table(self.name, sqla.MetaData(),
                      Column('c_fullname', sqla.String))
        if not table.exists(bind=db):
            log.info('no such table: %s', self.name)
            return False
        with self.connection() as q:
            actual = q.scalar(sqla.select([func.count(table.c.c_fullname)]))
            log.info('table %s has %d rows', self.name, actual)
            return actual >= self.rowcount  # type: ignore  # sqla

    def run(self) -> None:
        with self.input().dictreader(delimiter=self.delimiter,
                                     lowercase_fieldnames=True) as data:
            load(self._dbtarget().engine,
                 data,
                 self.name,
                 self.prototype,
                 skip=self.skip,
                 extra_colnames=self.extra_cols.split(','))
コード例 #4
0
ファイル: cms_etl.py プロジェクト: dckc/grouse
class BeneIdSurvey(FromCMS, SqlScriptTask):
    script = Script.bene_chunks_survey
    bene_chunks = IntParam(default=200, description='see client.cfg')
    bene_chunk_max = IntParam(default=None, description='see client.cfg')

    @property
    def variables(self) -> Environment:
        config = [(lib.CMS_RIF, self.source.cms_rif)]
        return dict(config, chunk_qty=str(self.bene_chunks))

    def run(self) -> None:
        SqlScriptTask.run_bound(self,
                                script_params=dict(chunk_qty=self.bene_chunks))

    def results(self) -> List[RowProxy]:
        with self.connection(event='survey results') as lc:
            q = '''
              select chunk_num
                , bene_id_qty
                , bene_id_first
                , bene_id_last
              from bene_chunks
              where chunk_qty = :chunk_qty
                and (:chunk_max is null or
                     chunk_num <= :chunk_max)
              order by chunk_num
            '''
            params = dict(chunk_max=self.bene_chunk_max,
                          chunk_qty=self.bene_chunks)  # type: Params
            Params  # tell flake8 we're using it.
            try:
                return lc.execute(q, params=params).fetchall()
            except DatabaseError:
                return []
コード例 #5
0
class I2PPatientGroupTask(I2PScriptTask):
    patient_num_first = IntParam()
    patient_num_last = IntParam()
    patient_num_qty = IntParam(significant=False, default=-1)
    group_num = IntParam(significant=False, default=-1)
    group_qty = IntParam(significant=False, default=-1)

    def run(self) -> None:
        SqlScriptTask.run_bound(self, script_params=dict(
            patient_num_first=self.patient_num_first, patient_num_last=self.patient_num_last))
コード例 #6
0
ファイル: etl_tasks.py プロジェクト: dckc/grouse
class MigrateUpload(SqlScriptTask, I2B2Task):
    upload_id = IntParam()
    workspace_star = StrParam()
    parallel_degree = IntParam(default=24, significant=False)

    script = Script.migrate_fact_upload

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    workspace_star=self.workspace_star,
                    parallel_degree=str(self.parallel_degree),
                    upload_id=str(self.upload_id))
コード例 #7
0
class MigrateRows(DBAccessTask):
    '''Migrate e.g. from an analyst's ontology to runtime i2b2 metadata.
    '''
    src = StrParam()
    dest = StrParam()
    # ListParam would be cleaner, but this avoids jenkins quoting foo.
    key_cols = StrParam()
    parallel_degree = IntParam(default=24)

    sql = """
        delete from {dest} dest
        where exists (
          select 1
          from {src} src
          where {key_constraint}
        );
        insert into {dest}
        select * from {src}
        """

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        key_constraints = [
            'src.{col} = dest.{col}'.format(col=col)
            for col in self.key_cols.split(',')
        ]
        sql = self.sql.format(src=self.src,
                              dest=self.dest,
                              key_constraint=' and '.join(key_constraints))
        with self.connection('migrate rows') as work:
            for st in sql.split(';\n'):
                work.execute(st)
            work.execute('commit')
コード例 #8
0
class MedparMapping(FromCMS, UploadTask):
    script = Script.medpar_encounter_map
    resources = {'encounter_mapping': 1}

    # This should probably be passed down from CSMRIFLoad
    # because that task and all the others in between
    # are a function of this parameter.
    encounter_num_start = IntParam()

    def requires(self) -> List[luigi.Task]:
        reset = MappingReset()
        return UploadTask.requires(self) + [self.source, reset]

    @property
    def variables(self) -> Environment:
        return dict(self.vars_for_deps,
                    encounter_num_start=str(self.encounter_num_start))
コード例 #9
0
class NPIDownloadConfig(luigi.Config):
    # The configured 'path' and 'npi' variables are used by the downloadNPI method to fetch and
    # store the NPPES zip file.  Changes to these may require changes to the file system.
    # TODO: Update code to discover the npi_csv automatically.
    dl_path = StrParam(description='Path where the NPPES zip file will be stored and unzipped.')
    extract_path = StrParam(description='Path where the extract')
    npi_csv = StrParam(description='CSV file in the NPPES zip that contains NPI data.')
    npi_url = StrParam(description='URL for the NPPES download site.')
    npi_zip = StrParam(description='Name of the NPPES zip file.')

    # The configured 'col' and 'ct' variables reflect the layout of the NPI data file.
    # The extracNPI method uses these values to parse the NPI data file.
    # Changes to these may require code changes.
    # Complete overkill making these configurable.  Consider reverting to hard coded values.
    taxonomy_col = StrParam(description='Header for the taxonomy columns in the NPI data file.')
    switch_col = StrParam(description='Header for the switch columns in the NPI data file.')
    npi_col = StrParam(description='Header for the NPI column in the NPI data file.')
    taxonomy_ct = IntParam(description='Number of taxonomy columns in the NPI data file.')
コード例 #10
0
class DBAccessTask(luigi.Task):
    """Manage DB account credentials and logging.

    Typical usage::

        with self.connection(event='munching data') as conn:
            howmany = conn.scalar('select count(*) from cookies')
            yummy = conn.execute('select * from cookies')
    """
    account = StrParam(default=ETLAccount().account)
    ssh_tunnel = StrParam(default=ETLAccount().ssh_tunnel,
                          significant=False)
    passkey = StrParam(default=ETLAccount().passkey,
                       significant=False)
    echo = BoolParam(default=ETLAccount().echo,
                     significant=False)
    max_idle = IntParam(description='Set to less than Oracle profile max idle time.',
                        default=60 * 20,
                        significant=False)
    _log = logging.getLogger(__name__)  # ISSUE: ambient.

    def output(self) -> luigi.Target:
        return self._dbtarget()

    def _dbtarget(self) -> DBTarget:
        return DBTarget(self._make_url(self.account),
                        target_table=None, update_id=self.task_id,
                        echo=self.echo)

    def _make_url(self, account: str) -> str:
        url = make_url(account)
        if 'oracle' in account.lower():
            url.query['pool_recycle'] = self.max_idle
            # `twophase` interferes with direct path load somehow.
            url.query['allow_twophase'] = False
        if self.passkey:
            from os import environ  # ISSUE: ambient
            url.password = environ[self.passkey]
        if self.ssh_tunnel and url.host:
            host, port = self.ssh_tunnel.split(':', 1)
            url.host = host
            url.port = port
        return str(url)

    def log_info(self) -> Dict[str, Any]:
        '''Get info to log: luigi params, task_family, task_hash.
        '''
        return dict(self.to_str_params(only_significant=True),
                    task_family=self.task_family,
                    task_hash=self.task_id[-luigi.task.TASK_ID_TRUNCATE_HASH:])

    @contextmanager
    def connection(self, event: str='connect') -> Iterator[LoggedConnection]:
        log = EventLogger(self._log, self.log_info())
        with log.step('%(event)s: <%(account)s>',
                      dict(event=event, account=self.account)) as step:
            conn = ConnectionProblem.tryConnect(self._dbtarget().engine)
            try:
                yield LoggedConnection(conn, log, step)
            finally:
                conn.close()

    def _fix_password(self, environ: Dict[str, str], getpass: Callable[[str], str]) -> None:
        '''for interactive use; e.g. in notebooks
        '''
        if self.passkey not in environ:
            environ[self.passkey] = getpass(self.passkey)
コード例 #11
0
ファイル: cms_i2p.py プロジェクト: badagarla/grouse
class FillTableFromView(DBAccessTask, I2B2Task):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    table = StrParam(description='PCORNet CDM data table name')
    script = cast(
        Script,
        luigi.EnumParameter(enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    # The PCORNet CDM HARVEST table has a refresh column for each
    # of the data tables -- 14 of them as of version 3.1.
    complete_test = 'select refresh_{table}_date from {ps}.harvest'

    @property
    def harvest(self) -> HarvestInit:
        return HarvestInit()

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script, param_vars=self.variables),
            SqlScriptTask(script=Script.cdm_harvest_init,
                          param_vars=self.variables)
        ]

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    PCORNET_CDM=self.harvest.schema)

    def complete(self) -> bool:
        deps = luigi.task.flatten(self.requires())  # type: List[luigi.Task]
        if not all(t.complete() for t in deps):
            return False

        table = self.table
        schema = self.harvest.schema
        with self.connection('{0} fresh?'.format(table)) as work:
            refreshed_at = work.scalar(
                self.complete_test.format(ps=schema, table=table))
        return refreshed_at is not None

    steps = [
        'delete from {ps}.{table}',  # ISSUE: lack of truncate privilege is a pain.
        'commit',
        '''insert /*+ append parallel({parallel_degree}) */ into {ps}.{table}
           select * from {view} where patid between :lo and :hi''',
        "update {ps}.harvest set refresh_{table}_date = sysdate, datamart_claims = (select present from harvest_enum)"
    ]

    def run(self) -> None:
        with self.connection(
                'refresh {table}'.format(table=self.table)) as work:
            groups = self.project.patient_groups(work, self.pat_group_qty)
            for step in self.steps:
                step = step.format(table=self.table,
                                   view=self.view,
                                   ps=self.harvest.schema,
                                   parallel_degree=self.parallel_degree)
                if insert_append_table(step):
                    log_plan(work,
                             'fill chunk of {table}'.format(table=self.table),
                             {},
                             sql=step)
                    for (qty, num, lo, hi) in groups:
                        work.execute(step, params=dict(lo=lo, hi=hi))
                else:
                    work.execute(step)
コード例 #12
0
class CDMStatusTask(DBAccessTask):
    '''
    A DBAccessTask that relies on the CDM status table to assess completion.

    Typical usage is to record the start of the task, run task operations and then record
    the end of the task.
        self.setTaskStart()
        self.load()
        self.setTaskEnd(self.getRecordCountFromTable())
    '''
    taskName = StrParam()

    # Basic status check, assume the typical task produces at least one record.
    expectedRecords = IntParam(default=1)

    statusTable = Table("cdm_status", MetaData(), Column('TASK'),
                        Column('START_TIME'), Column('END_TIME'),
                        Column('RECORDS'))

    def complete(self) -> bool:
        '''
        Complete when the CDM status table reports at least as many records as expected for the task.
        '''
        with self.connection() as q:
            statusTableRecordCount = q.scalar(
                'select records from cdm_status where task = \'%s\'' %
                self.taskName)

            # If true, the task has not been logged in the CDM status table or has been logged and is in an
            # inconsistent state with the number of records set to null.
            if statusTableRecordCount is None:
                return False

            log.info('task %s has %d rows', self.taskName,
                     statusTableRecordCount)
            return statusTableRecordCount >= self.expectedRecords  # type: ignore  # sqla

    def getRecordCountFromTable(self) -> int:
        '''
        Queries the database for the number of records in the table named taskName
        '''
        # This op is out of sync with the rest of the class, in that it assumes
        # the task must represent the creation of a table in the db.
        with self.connection() as q:
            return q.scalar(
                sqla.select([func.count()
                             ]).select_from(self.taskName))  # type: ignore

    def setTaskEnd(self, rowCount: int) -> None:
        '''
        Updates the taskName entry in the CDM status table with an end time of now and a count of records.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        db.execute(st.update().where(st.c.TASK == self.taskName),
                   [{
                       'END_TIME': datetime.now(),
                       'RECORDS': rowCount
                   }])

    def setTaskStart(self) -> None:
        '''
        Adds taskName to the CDM status table with a start time of now.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        # prune any failed attempt
        db.execute(st.delete().where(
            sqla.and_(st.c.TASK == self.taskName,
                      st.c.END_TIME == None)))  # noqa
        db.execute(st.insert(), [{
            'TASK': self.taskName,
            'START_TIME': datetime.now()
        }])
コード例 #13
0
class MetaTableCountPatients(DBAccessTask):
    i2b2star = StrParam()
    i2b2meta = StrParam()
    c_table_cd = StrParam()

    cell_size_threshold = IntParam(default=11)
    sentinel = -1

    def complete(self) -> bool:
        with self.connection('any c_totalnum needed?') as lc:
            return len(self.todo(lc)) == 0

    def todo(self, lc: LoggedConnection) -> pd.DataFrame:
        desc = self.activeDescendants(lc)
        return desc[desc.c_totalnum.isnull()]

    def activeDescendants(self, lc: LoggedConnection) -> pd.DataFrame:
        top = self.top(lc)
        desc = read_sql_step(
            '''
            select c_fullname, c_hlevel, c_visualattributes, c_totalnum, c_name, c_tooltip
            from {i2b2meta}.{meta_table} meta
            where meta.c_hlevel > :c_hlevel
              and meta.c_fullname like (:c_fullname || '%')
              and upper(meta.c_visualattributes) like '_A%'
              and c_synonym_cd = 'N'
              and m_applied_path = '@'
            order by meta.c_hlevel, upper(meta.c_name)
            '''.format(i2b2meta=self.i2b2meta,
                       meta_table=top.c_table_name).strip(),
            lc=lc,
            params=dict(c_fullname=top.c_fullname,
                        c_hlevel=int(top.c_hlevel))).set_index('c_fullname')
        return desc

    def top(self, lc: LoggedConnection) -> pd.Series:
        return read_sql_step(
            '''
            select c_table_cd, c_hlevel, c_visualattributes, c_name
                 , upper(c_table_name) c_table_name, c_fullname
            from {i2b2meta}.table_access ta
            where upper(ta.c_visualattributes) like '_A%'
              and ta.c_table_cd = :c_table_cd
            '''.format(i2b2meta=self.i2b2meta).strip(), lc,
            dict(c_table_cd=self.c_table_cd)).set_index('c_table_cd').iloc[0]

    def conceptPatientCount(
        self,
        top: pd.DataFrame,
        c_fullname: str,
        lc: LoggedConnection,
        # Moderate degree to support work on several tables in parallel.
        parallel_degree: int = 8
    ) -> int:
        counts = read_sql_step(
            '''
            select /*+ parallel({degree}) */
                   c_fullname, c_hlevel, c_visualattributes, c_name
                 , case
            when upper(meta.c_visualattributes)     like 'C%'
              then :sentinel * 1
            when lower(meta.c_tablename) <> 'concept_dimension'
              or lower(meta.c_operator) <> 'like'
              or lower(meta.c_facttablecolumn) <> 'concept_cd'
              then :sentinel * 2
            else coalesce((
                select count(distinct obs.patient_num)
                from (
                    select concept_cd
                    from {i2b2star}.concept_dimension
                    where concept_path like (meta.c_dimcode || '%')
                    ) cd
                join {i2b2star}.observation_fact obs
                  on obs.concept_cd = cd.concept_cd), :sentinel * 3)
            end c_totalnum
            from {i2b2meta}.{table_name} meta
            where c_synonym_cd = 'N'
              and meta.c_fullname = :c_fullname
            '''.strip().format(i2b2star=self.i2b2star,
                               i2b2meta=self.i2b2meta,
                               degree=parallel_degree,
                               table_name=top.c_table_name),
            lc=lc,
            params=dict(c_fullname=c_fullname,
                        sentinel=self.sentinel)).set_index('c_fullname')
        [count] = counts.c_totalnum.values
        return int(count)

    def run(self) -> None:
        with self.connection('update patient counts in %s' %
                             self.c_table_cd) as lc:
            top = self.top(lc)
            for c_fullname, concept in self.todo(lc).iterrows():
                count = self.conceptPatientCount(top, c_fullname, lc)
                if 0 <= count < self.cell_size_threshold:
                    count = self.sentinel * 5
                lc.execute('''
                    update {i2b2meta}.{table_name}
                    set c_totalnum = :total
                    where c_fullname = :c_fullname
                    '''.strip().format(i2b2meta=self.i2b2meta,
                                       table_name=top.c_table_name),
                           params=dict(c_fullname=c_fullname, total=count))
                lc.execute('commit')