Esempio n. 1
0
class MetaTableIndex(DBAccessTask):
    i2b2meta = StrParam()
    c_table_name = StrParam()

    wanted = [['m_applied_path'], ['c_fullname']]

    def complete(self) -> bool:
        with self.connection('getting indexes for %s' %
                             self.c_table_name) as lc:
            inspector = sqla.engine.reflection.Inspector(
                lc._conn)  # type: ignore
            indexes = inspector.get_indexes(self.c_table_name,
                                            schema=self.i2b2meta)

        indexed = [ix['column_names'] for ix in indexes]
        log.info('already indexed: %s', indexed)
        return all(cols in indexed for cols in self.wanted)

    def run(self) -> None:
        md = sqla.MetaData()
        with self.connection('adding indexes to %s' % self.c_table_name) as lc:
            t = sqla.Table(self.c_table_name,
                           md,
                           autoload=True,
                           autoload_with=lc._conn,
                           schema=self.i2b2meta)
            for ix_num, ix_cols in enumerate(self.wanted):
                ix = sqla.schema.Index(
                    "%s_%s_%d" % (t.name[:20], ix_cols[0][:8], ix_num),
                    *[t.c[cn] for cn in ix_cols])
                ix.create(lc._conn)
Esempio n. 2
0
class LoadOntology(DBAccessTask):
    name = StrParam()
    prototype = StrParam()
    filename = StrParam()
    delimiter = StrParam(default=',')
    extra_cols = StrParam(default='')
    rowcount = IntParam(default=1)
    skip = IntParam(default=None)

    def requires(self) -> luigi.Task:
        return SaveOntology(filename=self.filename)

    def complete(self) -> bool:
        db = self._dbtarget().engine
        table = Table(self.name, sqla.MetaData(),
                      Column('c_fullname', sqla.String))
        if not table.exists(bind=db):
            log.info('no such table: %s', self.name)
            return False
        with self.connection() as q:
            actual = q.scalar(sqla.select([func.count(table.c.c_fullname)]))
            log.info('table %s has %d rows', self.name, actual)
            return actual >= self.rowcount  # type: ignore  # sqla

    def run(self) -> None:
        with self.input().dictreader(delimiter=self.delimiter,
                                     lowercase_fieldnames=True) as data:
            load(self._dbtarget().engine,
                 data,
                 self.name,
                 self.prototype,
                 skip=self.skip,
                 extra_colnames=self.extra_cols.split(','))
Esempio n. 3
0
class MigrateRows(DBAccessTask):
    '''Migrate e.g. from an analyst's ontology to runtime i2b2 metadata.
    '''
    src = StrParam()
    dest = StrParam()
    # ListParam would be cleaner, but this avoids jenkins quoting foo.
    key_cols = StrParam()
    parallel_degree = IntParam(default=24)

    sql = """
        delete from {dest} dest
        where exists (
          select 1
          from {src} src
          where {key_constraint}
        );
        insert into {dest}
        select * from {src}
        """

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        key_constraints = [
            'src.{col} = dest.{col}'.format(col=col)
            for col in self.key_cols.split(',')
        ]
        sql = self.sql.format(src=self.src,
                              dest=self.dest,
                              key_constraint=' and '.join(key_constraints))
        with self.connection('migrate rows') as work:
            for st in sql.split(';\n'):
                work.execute(st)
            work.execute('commit')
Esempio n. 4
0
class I2B2ProjectCreate(DBAccessTask):
    star_schema = StrParam(description='see client.cfg')
    project_id = StrParam(description='see client.cfg')
    _meta = None  # type: Opt[sqla.MetaData]
    _upload_table = None  # type: Opt[sqla.Table]

    Column, ty = sqla.Column, sqla.types
    upload_status_columns = [
        Column('upload_id',
               ty.Numeric(38, 0, asdecimal=False),
               primary_key=True),
        Column('upload_label', ty.String(500), nullable=False),
        Column('user_id', ty.String(100), nullable=False),
        Column('source_cd', ty.String(50), nullable=False),
        Column('no_of_record', ty.Numeric(asdecimal=False)),
        Column('loaded_record', ty.Numeric(asdecimal=False)),
        Column('deleted_record', ty.Numeric(asdecimal=False)),
        Column('load_date', ty.DateTime, nullable=False),
        Column('end_date', ty.DateTime),
        Column('load_status', ty.String(100)),
        Column('message', ty.Text),
        Column('input_file_name', ty.Text),
        Column('log_file_name', ty.Text),
        Column('transform_name', ty.String(500)),
    ]

    def output(self) -> 'SchemaTarget':
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.star_schema,
                            table_eg='patient_dimension',
                            echo=self.echo)

    def run(self) -> None:
        raise NotImplementedError('see heron_create.create_deid_datamart etc.')

    @property
    def metadata(self) -> sqla.MetaData:
        if self._meta:
            return self._meta
        self._meta = meta = sqla.MetaData(schema=self.star_schema)
        return meta

    def table_details(self, lc: LoggedConnection,
                      tables: List[str]) -> sqla.MetaData:
        i2b2_meta = sqla.MetaData(schema=self.star_schema)
        i2b2_meta.reflect(only=tables,
                          schema=self.star_schema,
                          bind=self._dbtarget().engine)
        return i2b2_meta

    @property
    def upload_table(self) -> sqla.Table:
        if self._upload_table is not None:
            return self._upload_table
        t = sqla.Table('upload_status',
                       self.metadata,
                       *self.upload_status_columns,
                       schema=self.star_schema)
        self._upload_table = t
        return t
Esempio n. 5
0
class I2PConfig(luigi.Config):
    datamart_id = StrParam(description='see client.cfg')
    datamart_name = StrParam(description='see client.cfg')
    enrollment_months_back = StrParam(description='see client.cfg')
    i2b2_data_schema = StrParam(description='see client.cfg')
    i2b2_etl_schema = StrParam(description='see client.cfg')
    i2b2_meta_schema = StrParam(description='see client.cfg')
    min_pat_list_date_dd_mon_rrrr = StrParam(description='see client.cfg')
    min_visit_date_dd_mon_rrrr = StrParam(description='see client.cfg')
    network_id = StrParam(description='see client.cfg')
    network_name = StrParam(description='see client.cfg')
Esempio n. 6
0
class MetaTableResetCounts(DBAccessTask):
    i2b2meta = StrParam()
    c_table_name = StrParam()

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        with self.connection('resetting c_totalnum for %s' %
                             self.c_table_name) as lc:
            lc.execute('''
                update {i2b2meta}.{table_name} set c_totalnum = null
                '''.strip().format(i2b2meta=self.i2b2meta,
                                   table_name=self.c_table_name))
Esempio n. 7
0
class ETLAccount(luigi.Config):
    '''Access to connect and run ETL.

    This account needs read access to source material, write access to
    the destination i2b2 star schema, and working space for
    intermediary tables, views, functions, and such.
    '''
    account = StrParam(description='see client.cfg', default='')
    passkey = StrParam(description='see client.cfg',
                       default='',
                       significant=False)
    ssh_tunnel = StrParam(description='see client.cfg',
                          default='',
                          significant=False)
    echo = BoolParam(description='SQLAlchemy echo logging', significant=False)
Esempio n. 8
0
class MetaCountPatients(_ForEachMetaTable):
    i2b2star = StrParam()

    def subTask(self, table_cd: str, info: pd.Series) -> luigi.Task:
        return MetaTableCountPatients(i2b2star=self.i2b2star,
                                      i2b2meta=self.i2b2meta,
                                      c_table_cd=table_cd)
Esempio n. 9
0
class MigratePendingUploads(DBAccessTask, I2B2Task, luigi.WrapperTask):
    workspace_star = StrParam()

    find_pending = """
    select upload_id from %(WORKSPACE)s.upload_status
    where load_status in ('OK', 'OK_work') and upload_id not in (
      select upload_id from %(I2B2STAR)s.upload_status
      where load_status='OK' )
    """

    def requires(self) -> Iterator[luigi.Task]:
        find_pending = self.find_pending % dict(
            WORKSPACE=self.workspace_star,
            I2B2STAR=self.project.star_schema)

        with self.connection('pending uploads') as lc:
            pending = [row.upload_id for row in
                       lc.execute(find_pending).fetchall()]

            workmeta = sqla.MetaData()
            for upload_id in pending:
                table = Table('observation_fact_%d' % upload_id, workmeta,
                              schema=self.workspace_star)
                if table.exists(bind=lc._conn):
                    yield MigrateUpload(upload_id=upload_id,
                                        workspace_star=self.workspace_star)
                else:
                    log.warn('no such table to migrate: %s', table)
Esempio n. 10
0
class FillTableFromView(_HarvestRefresh):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    script = cast(Script, luigi.EnumParameter(
        enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script,
                          param_vars=self.variables),
            HarvestInit.script_with_vars(self.variables)
        ]

    bulk_insert = '''insert /*+ append parallel({degree}) */ into {ps}.{table}
           select * from {view}'''

    def load(self, work: LoggedConnection) -> None:
        step = self.bulk_insert.format(table=self.table, view=self.view,
                                       ps=self.harvest.schema,
                                       degree=self.parallel_degree)
        log_plan(work, 'fill chunk of {table}'.format(table=self.table), {},
                 sql=step)
        work.execute(step)
Esempio n. 11
0
class CMSExtract(SourceTask, DBAccessTask):
    download_date = TimeStampParam(description='see client.cfg')
    cms_rif = StrParam(description='see client.cfg')

    script_variable = 'cms_source_cd'

    # ISSUE: separate source_cd for yr1-3 vs 4-5? see MappingReset
    source_cd = "'ccwdata.org'"

    table_eg = 'pde'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.cms_rif,
                            table_eg=self.table_eg,
                            echo=self.echo)

    def table_details(self, lc: LoggedConnection,
                      tables: List[str]) -> MetaData:
        rif_meta = MetaData(schema=self.cms_rif)
        rif_meta.reflect(only=tables,
                         schema=self.cms_rif,
                         bind=self._dbtarget().engine)
        return rif_meta

    def run(self) -> None:
        raise NotImplementedError(
            'cannot find %s.%s. CMS Extract is built elsewhere.' %
            (self.cms_rif, self.table_eg))
Esempio n. 12
0
class AdHoc(DBAccessTask):
    sql = StrParam()
    name = StrParam()

    def _csvout(self) -> CSVTarget:
        return CSVTarget(path=self.name + '.csv')

    def output(self) -> luigi.Target:
        return self._csvout()

    def run(self) -> None:
        with self.connection() as work:
            result = work.execute(self.sql)
            cols = result.keys()
            rows = result.fetchall()
            self._csvout().export(cols, rows)
Esempio n. 13
0
class SaveOntology(luigi.Task):
    filename = StrParam()

    def output(self) -> luigi.Target:
        return CSVTarget(path=self.filename)

    def requires(self) -> List[luigi.Target]:
        return []
Esempio n. 14
0
class HarvestInit(SqlScriptTask):
    '''Create HARVEST table with one row.
    '''
    script = Script.cdm_harvest_init
    schema = StrParam(description='PCORNet CDM schema name',
                      default='CMS_PCORNET_CDM')

    @property
    def variables(self) -> Environment:
        return dict(PCORNET_CDM=self.schema)
Esempio n. 15
0
class _ForEachMetaTable(DBAccessTask, luigi.WrapperTask):
    i2b2meta = StrParam()

    def requires(self) -> List[luigi.Task]:
        with self.connection('finding metadata tables') as lc:
            each = topFolders(self.i2b2meta, lc)
        return [
            self.subTask(table_cd, info) for table_cd, info in each.iterrows()
        ]

    def subTask(self, table_cd: str, info: pd.Series) -> luigi.Task:
        raise NotImplementedError
Esempio n. 16
0
class MigrateUpload(SqlScriptTask, I2B2Task):
    upload_id = IntParam()
    workspace_star = StrParam()
    parallel_degree = IntParam(default=24, significant=False)

    script = Script.migrate_fact_upload

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    workspace_star=self.workspace_star,
                    parallel_degree=str(self.parallel_degree),
                    upload_id=str(self.upload_id))
Esempio n. 17
0
class MetaToConcepts(UploadTask):
    script = Script.concept_dimension_fill
    ont_table_name = StrParam(
        description="table to scan for c_tablename = 'concept_dimension' records")

    @property
    def i2b2meta(self) -> str:
        raise NotImplementedError('subclass must implement')

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    I2B2META=self.i2b2meta,
                    ONT_TABLE_NAME=self.ont_table_name)
Esempio n. 18
0
class ExploreSchema(DBAccessTask):
    schema_name = StrParam()

    def _csvout(self) -> CSVTarget:
        return CSVTarget(path=self.schema_name + '.csv')

    def output(self) -> luigi.Target:
        return self._csvout()

    def run(self) -> None:
        conn = self._dbtarget().engine.connect()
        with conn.begin():
            info = ColumnInfo.from_owner(conn, self.schema_name)
            self._csvout().export(list(ColumnInfo._fields), info)
Esempio n. 19
0
class KillSessions(DBAccessTask):
    reason = StrParam(default='*no reason given*')
    sql = '''
    begin
      sys.kill_own_sessions(:reason);
    end;
    '''

    def complete(self) -> bool:
        return False

    def run(self) -> None:
        with self.connection('kill own sessions') as work:
            work.execute(self.sql, params=dict(reason=self.reason))
Esempio n. 20
0
class NPIDownloadConfig(luigi.Config):
    # The configured 'path' and 'npi' variables are used by the downloadNPI method to fetch and
    # store the NPPES zip file.  Changes to these may require changes to the file system.
    # TODO: Update code to discover the npi_csv automatically.
    dl_path = StrParam(description='Path where the NPPES zip file will be stored and unzipped.')
    extract_path = StrParam(description='Path where the extract')
    npi_csv = StrParam(description='CSV file in the NPPES zip that contains NPI data.')
    npi_url = StrParam(description='URL for the NPPES download site.')
    npi_zip = StrParam(description='Name of the NPPES zip file.')

    # The configured 'col' and 'ct' variables reflect the layout of the NPI data file.
    # The extracNPI method uses these values to parse the NPI data file.
    # Changes to these may require code changes.
    # Complete overkill making these configurable.  Consider reverting to hard coded values.
    taxonomy_col = StrParam(description='Header for the taxonomy columns in the NPI data file.')
    switch_col = StrParam(description='Header for the switch columns in the NPI data file.')
    npi_col = StrParam(description='Header for the NPI column in the NPI data file.')
    taxonomy_ct = IntParam(description='Number of taxonomy columns in the NPI data file.')
Esempio n. 21
0
class _HarvestRefresh(DBAccessTask, I2B2Task):
    table = StrParam(description='PCORNet CDM data table name')

    # The PCORNet CDM HARVEST table has a refresh column for each
    # of the data tables -- 14 of them as of version 3.1.
    complete_test = 'select refresh_{table}_date from {ps}.harvest'

    @property
    def harvest(self) -> HarvestInit:
        return HarvestInit()

    def complete(self) -> bool:
        deps = luigi.task.flatten(self.requires())  # type: List[luigi.Task]
        if not all(t.complete() for t in deps):
            return False

        table = self.table
        schema = self.harvest.schema
        with self.connection('{0} fresh?'.format(table)) as work:
            refreshed_at = work.scalar(self.complete_test.format(
                ps=schema, table=table))
        return refreshed_at is not None

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    PCORNET_CDM=self.harvest.schema)

    steps = [
        'delete from {ps}.{table}',  # ISSUE: lack of truncate privilege is a pain.
        'commit',
        None,
        "update {ps}.harvest set refresh_{table}_date = sysdate, datamart_claims = (select present from harvest_enum)"
    ]

    def run(self) -> None:
        with self.connection('refresh {table}'.format(table=self.table)) as work:
            for step in self.steps:
                if step is None:
                    self.load(work)
                else:
                    step = step.format(table=self.table, ps=self.harvest.schema)
                    work.execute(step)

    def load(self, work: LoggedConnection) -> None:
        raise NotImplementedError('abstract')
Esempio n. 22
0
class LoadCSV(CDMStatusTask):
    '''
    Creates a table in the db with the name taskName.

    The table is loaded with data from the csv file specifed by csvname.
    Table creation is logged in the cdm status table.
    '''
    csvname = StrParam()

    def run(self) -> None:
        self.setTaskStart()
        self.load()
        self.setTaskEnd(self.getRecordCountFromTable())

    def load(self) -> None:
        def sz(l: int, chunk: int = 16) -> int:
            return max(chunk, chunk * ((l + chunk - 1) // chunk))

        db = self._dbtarget().engine
        schema = MetaData()
        l = list()

        with open(self.csvname) as fin:  # ISSUE: ambient
            dr = DictReader(fin)

            Dict  # for tools that don't see type: comments.
            mcl = defaultdict(int)  # type: Dict[str, int]
            # Iterate data in the input csv, find the largest data item
            # and set the column size to the item size rounded to the nearest chunk.
            for row in dr:
                l.append(row)
                for col in dr.fieldnames:
                    mcl[col] = sz(max(mcl[col], len(row[col])))

            columns = ([Column(n, String(mcl[n])) for n in dr.fieldnames])
            table = Table(self.taskName, schema, *columns)
            if table.exists(bind=db):
                table.drop(db)
            table.create(db)

            db.execute(table.insert(), l)
Esempio n. 23
0
class CMSExtract(SourceTask, DBAccessTask):
    download_date = TimeStampParam(description='see client.cfg')
    cms_rif = StrParam(description='see client.cfg')

    script_variable = 'cms_source_cd'
    source_cd = "'ccwdata.org'"

    table_eg = 'mbsf_ab_summary'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.cms_rif,
                            table_eg=self.table_eg,
                            echo=self.echo)

    def table_details(self, lc: LoggedConnection,
                      tables: List[str]) -> MetaData:
        rif_meta = MetaData(schema=self.cms_rif)
        rif_meta.reflect(only=tables,
                         schema=self.cms_rif,
                         bind=self._dbtarget().engine)
        return rif_meta
Esempio n. 24
0
class TerminologyDumpImport(SourceTask, DBAccessTask):
    '''
      - Shrine Terminology Files - Oracle Reeder Apr 10
        Reeder at UTSouthwestern
        Mon Apr 10 10:39:05 CDT 2017
        http://listserv.kumc.edu/pipermail/gpc-dev/2017q2/003805.html
      - ​basic SNOW ontology alignment: Demographics, Diagnoses
        GPC:ticket:525#comment:12 Apr 17
        https://informatics.gpcnetwork.org/trac/Project/ticket/525#comment:12
      - PCORIMETADATA_3_24_2017.dmp
    '''
    # March 24 is really last modified date.
    # Jun  6 09:17 is when I downloaded it and imported it.
    download_date = cast(datetime, TimeStampParameter(default=datetime(2017, 3, 24)))
    source_cd = "'PCORNET_CDM'"

    pcorimetadata = StrParam(default='PCORIMETADATA')
    table_eg = 'pcornet_demo'

    def _dbtarget(self) -> DBTarget:
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.pcorimetadata,
                            table_eg=self.table_eg,
                            echo=self.echo)
Esempio n. 25
0
class MetaTableCountPatients(DBAccessTask):
    i2b2star = StrParam()
    i2b2meta = StrParam()
    c_table_cd = StrParam()

    cell_size_threshold = IntParam(default=11)
    sentinel = -1

    def complete(self) -> bool:
        with self.connection('any c_totalnum needed?') as lc:
            return len(self.todo(lc)) == 0

    def todo(self, lc: LoggedConnection) -> pd.DataFrame:
        desc = self.activeDescendants(lc)
        return desc[desc.c_totalnum.isnull()]

    def activeDescendants(self, lc: LoggedConnection) -> pd.DataFrame:
        top = self.top(lc)
        desc = read_sql_step(
            '''
            select c_fullname, c_hlevel, c_visualattributes, c_totalnum, c_name, c_tooltip
            from {i2b2meta}.{meta_table} meta
            where meta.c_hlevel > :c_hlevel
              and meta.c_fullname like (:c_fullname || '%')
              and upper(meta.c_visualattributes) like '_A%'
              and c_synonym_cd = 'N'
              and m_applied_path = '@'
            order by meta.c_hlevel, upper(meta.c_name)
            '''.format(i2b2meta=self.i2b2meta,
                       meta_table=top.c_table_name).strip(),
            lc=lc,
            params=dict(c_fullname=top.c_fullname,
                        c_hlevel=int(top.c_hlevel))).set_index('c_fullname')
        return desc

    def top(self, lc: LoggedConnection) -> pd.Series:
        return read_sql_step(
            '''
            select c_table_cd, c_hlevel, c_visualattributes, c_name
                 , upper(c_table_name) c_table_name, c_fullname
            from {i2b2meta}.table_access ta
            where upper(ta.c_visualattributes) like '_A%'
              and ta.c_table_cd = :c_table_cd
            '''.format(i2b2meta=self.i2b2meta).strip(), lc,
            dict(c_table_cd=self.c_table_cd)).set_index('c_table_cd').iloc[0]

    def conceptPatientCount(
        self,
        top: pd.DataFrame,
        c_fullname: str,
        lc: LoggedConnection,
        # Moderate degree to support work on several tables in parallel.
        parallel_degree: int = 8
    ) -> int:
        counts = read_sql_step(
            '''
            select /*+ parallel({degree}) */
                   c_fullname, c_hlevel, c_visualattributes, c_name
                 , case
            when upper(meta.c_visualattributes)     like 'C%'
              then :sentinel * 1
            when lower(meta.c_tablename) <> 'concept_dimension'
              or lower(meta.c_operator) <> 'like'
              or lower(meta.c_facttablecolumn) <> 'concept_cd'
              then :sentinel * 2
            else coalesce((
                select count(distinct obs.patient_num)
                from (
                    select concept_cd
                    from {i2b2star}.concept_dimension
                    where concept_path like (meta.c_dimcode || '%')
                    ) cd
                join {i2b2star}.observation_fact obs
                  on obs.concept_cd = cd.concept_cd), :sentinel * 3)
            end c_totalnum
            from {i2b2meta}.{table_name} meta
            where c_synonym_cd = 'N'
              and meta.c_fullname = :c_fullname
            '''.strip().format(i2b2star=self.i2b2star,
                               i2b2meta=self.i2b2meta,
                               degree=parallel_degree,
                               table_name=top.c_table_name),
            lc=lc,
            params=dict(c_fullname=c_fullname,
                        sentinel=self.sentinel)).set_index('c_fullname')
        [count] = counts.c_totalnum.values
        return int(count)

    def run(self) -> None:
        with self.connection('update patient counts in %s' %
                             self.c_table_cd) as lc:
            top = self.top(lc)
            for c_fullname, concept in self.todo(lc).iterrows():
                count = self.conceptPatientCount(top, c_fullname, lc)
                if 0 <= count < self.cell_size_threshold:
                    count = self.sentinel * 5
                lc.execute('''
                    update {i2b2meta}.{table_name}
                    set c_totalnum = :total
                    where c_fullname = :c_fullname
                    '''.strip().format(i2b2meta=self.i2b2meta,
                                       table_name=top.c_table_name),
                           params=dict(c_fullname=c_fullname, total=count))
                lc.execute('commit')
Esempio n. 26
0
class CDMStatusTask(DBAccessTask):
    '''
    A DBAccessTask that relies on the CDM status table to assess completion.

    Typical usage is to record the start of the task, run task operations and then record
    the end of the task.
        self.setTaskStart()
        self.load()
        self.setTaskEnd(self.getRecordCountFromTable())
    '''
    taskName = StrParam()

    # Basic status check, assume the typical task produces at least one record.
    expectedRecords = IntParam(default=1)

    statusTable = Table("cdm_status", MetaData(), Column('TASK'),
                        Column('START_TIME'), Column('END_TIME'),
                        Column('RECORDS'))

    def complete(self) -> bool:
        '''
        Complete when the CDM status table reports at least as many records as expected for the task.
        '''
        with self.connection() as q:
            statusTableRecordCount = q.scalar(
                'select records from cdm_status where task = \'%s\'' %
                self.taskName)

            # If true, the task has not been logged in the CDM status table or has been logged and is in an
            # inconsistent state with the number of records set to null.
            if statusTableRecordCount is None:
                return False

            log.info('task %s has %d rows', self.taskName,
                     statusTableRecordCount)
            return statusTableRecordCount >= self.expectedRecords  # type: ignore  # sqla

    def getRecordCountFromTable(self) -> int:
        '''
        Queries the database for the number of records in the table named taskName
        '''
        # This op is out of sync with the rest of the class, in that it assumes
        # the task must represent the creation of a table in the db.
        with self.connection() as q:
            return q.scalar(
                sqla.select([func.count()
                             ]).select_from(self.taskName))  # type: ignore

    def setTaskEnd(self, rowCount: int) -> None:
        '''
        Updates the taskName entry in the CDM status table with an end time of now and a count of records.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        db.execute(st.update().where(st.c.TASK == self.taskName),
                   [{
                       'END_TIME': datetime.now(),
                       'RECORDS': rowCount
                   }])

    def setTaskStart(self) -> None:
        '''
        Adds taskName to the CDM status table with a start time of now.
        '''
        st = self.statusTable
        db = self._dbtarget().engine
        # prune any failed attempt
        db.execute(st.delete().where(
            sqla.and_(st.c.TASK == self.taskName,
                      st.c.END_TIME == None)))  # noqa
        db.execute(st.insert(), [{
            'TASK': self.taskName,
            'START_TIME': datetime.now()
        }])
Esempio n. 27
0
class I2B2ProjectCreate(DBAccessTask):
    star_schema = StrParam(description='see client.cfg')
    project_id = StrParam(description='see client.cfg')
    _meta = None          # type: Opt[sqla.MetaData]
    _upload_table = None  # type: Opt[sqla.Table]

    Column, ty = sqla.Column, sqla.types
    upload_status_columns = [
        Column('upload_id', ty.Numeric(38, 0, asdecimal=False), primary_key=True),
        Column('upload_label', ty.String(500), nullable=False),
        Column('user_id', ty.String(100), nullable=False),
        Column('source_cd', ty.String(50), nullable=False),
        Column('no_of_record', ty.Numeric(asdecimal=False)),
        Column('loaded_record', ty.Numeric(asdecimal=False)),
        Column('deleted_record', ty.Numeric(asdecimal=False)),
        Column('load_date', ty.DateTime, nullable=False),
        Column('end_date', ty.DateTime),
        Column('load_status', ty.String(100)),
        Column('message', ty.Text),
        Column('input_file_name', ty.Text),
        Column('log_file_name', ty.Text),
        Column('transform_name', ty.String(500)),
    ]

    def output(self) -> 'SchemaTarget':
        return SchemaTarget(self._make_url(self.account),
                            schema_name=self.star_schema,
                            table_eg='patient_dimension',
                            echo=self.echo)

    def run(self) -> None:
        raise NotImplementedError('see heron_create.create_deid_datamart etc.')

    @property
    def metadata(self) -> sqla.MetaData:
        if self._meta:
            return self._meta
        self._meta = meta = sqla.MetaData(schema=self.star_schema)
        return meta

    def table_details(self, lc: LoggedConnection, tables: List[str]) -> sqla.MetaData:
        i2b2_meta = sqla.MetaData(schema=self.star_schema)
        i2b2_meta.reflect(only=tables, schema=self.star_schema,
                          bind=self._dbtarget().engine)
        return i2b2_meta

    @property
    def upload_table(self) -> sqla.Table:
        if self._upload_table is not None:
            return self._upload_table
        t = sqla.Table(
            'upload_status', self.metadata,
            *self.upload_status_columns,
            schema=self.star_schema)
        self._upload_table = t
        return t

    pat_grp_q = '''
        select :group_qty grp_qty, group_num
             , min(patient_num) patient_num_lo
             , max(patient_num) patient_num_hi
        from (
          select patient_num
               , ntile(:group_qty) over (order by patient_num) as group_num
          from (
            select /*+ parallel(20) */ distinct patient_num from {i2b2_star}.patient_dimension
            where patient_num is not null
          ) ea
        ) w_ntile
        group by group_num, :group_qty
        order by group_num
    '''

    def patient_groups(self, q: LoggedConnection, qty: int) -> List[RowProxy]:
        groups = q.execute(self.pat_grp_q.format(i2b2_star=self.star_schema),
                           params=dict(group_qty=qty)).fetchall()
        q.log.info('groups: %s', groups)
        return groups
Esempio n. 28
0
class DBAccessTask(luigi.Task):
    """Manage DB account credentials and logging.

    Typical usage::

        with self.connection(event='munching data') as conn:
            howmany = conn.scalar('select count(*) from cookies')
            yummy = conn.execute('select * from cookies')
    """
    account = StrParam(default=ETLAccount().account)
    ssh_tunnel = StrParam(default=ETLAccount().ssh_tunnel,
                          significant=False)
    passkey = StrParam(default=ETLAccount().passkey,
                       significant=False)
    echo = BoolParam(default=ETLAccount().echo,
                     significant=False)
    max_idle = IntParam(description='Set to less than Oracle profile max idle time.',
                        default=60 * 20,
                        significant=False)
    _log = logging.getLogger(__name__)  # ISSUE: ambient.

    def output(self) -> luigi.Target:
        return self._dbtarget()

    def _dbtarget(self) -> DBTarget:
        return DBTarget(self._make_url(self.account),
                        target_table=None, update_id=self.task_id,
                        echo=self.echo)

    def _make_url(self, account: str) -> str:
        url = make_url(account)
        if 'oracle' in account.lower():
            url.query['pool_recycle'] = self.max_idle
            # `twophase` interferes with direct path load somehow.
            url.query['allow_twophase'] = False
        if self.passkey:
            from os import environ  # ISSUE: ambient
            url.password = environ[self.passkey]
        if self.ssh_tunnel and url.host:
            host, port = self.ssh_tunnel.split(':', 1)
            url.host = host
            url.port = port
        return str(url)

    def log_info(self) -> Dict[str, Any]:
        '''Get info to log: luigi params, task_family, task_hash.
        '''
        return dict(self.to_str_params(only_significant=True),
                    task_family=self.task_family,
                    task_hash=self.task_id[-luigi.task.TASK_ID_TRUNCATE_HASH:])

    @contextmanager
    def connection(self, event: str='connect') -> Iterator[LoggedConnection]:
        log = EventLogger(self._log, self.log_info())
        with log.step('%(event)s: <%(account)s>',
                      dict(event=event, account=self.account)) as step:
            conn = ConnectionProblem.tryConnect(self._dbtarget().engine)
            try:
                yield LoggedConnection(conn, log, step)
            finally:
                conn.close()

    def _fix_password(self, environ: Dict[str, str], getpass: Callable[[str], str]) -> None:
        '''for interactive use; e.g. in notebooks
        '''
        if self.passkey not in environ:
            environ[self.passkey] = getpass(self.passkey)
Esempio n. 29
0
class FillTableFromView(DBAccessTask, I2B2Task):
    '''Fill (insert into) PCORNet CDM table from a view of I2B2 data.

    Use HARVEST refresh columns to track completion status.
    '''
    table = StrParam(description='PCORNet CDM data table name')
    script = cast(
        Script,
        luigi.EnumParameter(enum=Script, description='script to build view'))
    view = StrParam(description='Transformation view')
    parallel_degree = IntParam(default=6, significant=False)
    pat_group_qty = IntParam(default=6, significant=False)

    # The PCORNet CDM HARVEST table has a refresh column for each
    # of the data tables -- 14 of them as of version 3.1.
    complete_test = 'select refresh_{table}_date from {ps}.harvest'

    @property
    def harvest(self) -> HarvestInit:
        return HarvestInit()

    def requires(self) -> List[luigi.Task]:
        return [
            self.project,  # I2B2 project
            SqlScriptTask(script=self.script, param_vars=self.variables),
            SqlScriptTask(script=Script.cdm_harvest_init,
                          param_vars=self.variables)
        ]

    @property
    def variables(self) -> Environment:
        return dict(I2B2STAR=self.project.star_schema,
                    PCORNET_CDM=self.harvest.schema)

    def complete(self) -> bool:
        deps = luigi.task.flatten(self.requires())  # type: List[luigi.Task]
        if not all(t.complete() for t in deps):
            return False

        table = self.table
        schema = self.harvest.schema
        with self.connection('{0} fresh?'.format(table)) as work:
            refreshed_at = work.scalar(
                self.complete_test.format(ps=schema, table=table))
        return refreshed_at is not None

    steps = [
        'delete from {ps}.{table}',  # ISSUE: lack of truncate privilege is a pain.
        'commit',
        '''insert /*+ append parallel({parallel_degree}) */ into {ps}.{table}
           select * from {view} where patid between :lo and :hi''',
        "update {ps}.harvest set refresh_{table}_date = sysdate, datamart_claims = (select present from harvest_enum)"
    ]

    def run(self) -> None:
        with self.connection(
                'refresh {table}'.format(table=self.table)) as work:
            groups = self.project.patient_groups(work, self.pat_group_qty)
            for step in self.steps:
                step = step.format(table=self.table,
                                   view=self.view,
                                   ps=self.harvest.schema,
                                   parallel_degree=self.parallel_degree)
                if insert_append_table(step):
                    log_plan(work,
                             'fill chunk of {table}'.format(table=self.table),
                             {},
                             sql=step)
                    for (qty, num, lo, hi) in groups:
                        work.execute(step, params=dict(lo=lo, hi=hi))
                else:
                    work.execute(step)