def process_nonpatient_table(table: Table, engine: Engine,
                             progargs: Any) -> None:
    if progargs.rcep:
        return
    log.info("Preprocessing non-patient table {}".format(repr(table.name)))
    pk_col = get_effective_int_pk_col(table)
    other_pk_col = pk_col if pk_col != CRATE_COL_PK else None
    if other_pk_col:  # table has a primary key already
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
    else:
        crate_pk_col = make_bigint_autoincrement_column(CRATE_COL_PK,
                                                        engine.dialect)
    table.append_column(crate_pk_col)  # must be Table-bound, as above
    add_columns(engine, table, [crate_pk_col])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_PK])
    if other_pk_col:
        execute(engine, """
            UPDATE {tablename} SET {crate_pk} = {rio_pk}
            WHERE {crate_pk} IS NULL
        """.format(tablename=table.name,
                   crate_pk=CRATE_COL_PK,
                   rio_pk=other_pk_col))
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True,
        },
    ])

def process_master_patient_table(table: Table, engine: Engine,
                                 progargs: Any) -> None:
    crate_col_nhs_number = Column(CRATE_COL_NHS_NUMBER, BigInteger,
                                  nullable=True)
    table.append_column(crate_col_nhs_number)
    add_columns(engine, table, [crate_col_nhs_number])
    if progargs.rcep:
        nhscol = RCEP_COL_NHS_NUMBER
    else:
        nhscol = RIO_COL_NHS_NUMBER
    log.info("Table {}: updating column {}".format(repr(table.name),
                                                   repr(nhscol)))
    ensure_columns_present(engine, tablename=table.name,
                           column_names=[nhscol])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_NHS_NUMBER])
    execute(engine, """
        UPDATE {tablename} SET
            {nhs_number_int} = CAST({nhscol} AS BIGINT)
        WHERE {nhs_number_int} IS NULL
    """.format(tablename=table.name,
               nhs_number_int=CRATE_COL_NHS_NUMBER,
               nhscol=nhscol))

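# Hedged verification sketch, not part of the original preprocessing: after
# process_master_patient_table() has run, this counts rows whose integer NHS
# number column is still unpopulated despite a non-NULL source column (e.g.
# rows inserted since the last preprocessing run). The helper and its
# parameters are illustrative assumptions, not existing CRATE API.
def count_unconverted_nhs_numbers(engine: Engine, tablename: str,
                                  nhscol: str) -> int:
    from sqlalchemy import text  # may duplicate this module's imports
    sql = """
        SELECT COUNT(*) FROM {tablename}
        WHERE {nhscol} IS NOT NULL AND {nhs_number_int} IS NULL
    """.format(tablename=tablename,
               nhscol=nhscol,
               nhs_number_int=CRATE_COL_NHS_NUMBER)
    with engine.connect() as connection:
        return connection.execute(text(sql)).scalar()
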
def process_clindocs_table(table: Table, engine: Engine,
                           progargs: Any) -> None:
    # For RiO only, not RCEP
    crate_col_max_docver = Column(CRATE_COL_MAX_DOCVER, Integer,
                                  nullable=True)
    crate_col_last_doc = Column(CRATE_COL_LAST_DOC, Integer, nullable=True)
    table.append_column(crate_col_max_docver)
    table.append_column(crate_col_last_doc)
    add_columns(engine, table, [crate_col_max_docver, crate_col_last_doc])
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_RIONUM_SERIALNUM,
            'column': '{rio_number}, SerialNumber'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {
            'index_name': CRATE_IDX_MAX_DOCVER,
            'column': CRATE_COL_MAX_DOCVER,
        },
        {
            'index_name': CRATE_IDX_LAST_DOC,
            'column': CRATE_COL_LAST_DOC,
        },
    ])
    required_cols = ["SerialNumber", "RevisionID"]
    if not progargs.print:
        required_cols.extend([CRATE_COL_MAX_DOCVER, CRATE_COL_LAST_DOC,
                              CRATE_COL_RIO_NUMBER])
    ensure_columns_present(engine, tablename=table.name,
                           column_names=required_cols)

    # Find the maximum RevisionID for each document (each RiO number /
    # SerialNumber combination), and store it.
    # Slow query, even with index.
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_DOCVER)))
    execute(engine, """
        UPDATE p1
        SET p1.{max_docver_col} = subq.max_docver
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, SerialNumber,
                MAX(RevisionID) AS max_docver
            FROM {tablename} p2
            GROUP BY {rio_number}, SerialNumber
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.SerialNumber = p1.SerialNumber
        WHERE p1.{max_docver_col} IS NULL
    """.format(max_docver_col=CRATE_COL_MAX_DOCVER,
               tablename=table.name,
               rio_number=CRATE_COL_RIO_NUMBER))

    # Set a flag indicating the most recent revision of each document.
    log.info("Clinical documents table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_DOC)))
    execute(engine, """
        UPDATE {tablename} SET
            {last_doc_col} =
                CASE WHEN RevisionID = {max_docver_col} THEN 1 ELSE 0 END
        WHERE {last_doc_col} IS NULL
    """.format(tablename=table.name,
               last_doc_col=CRATE_COL_LAST_DOC,
               max_docver_col=CRATE_COL_MAX_DOCVER))

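# Hedged sanity-check sketch, not part of the original preprocessing: after
# process_clindocs_table() has run, each (RiO number, SerialNumber) group
# should contain exactly one row flagged as the latest revision. The helper
# and its name are illustrative assumptions, not existing CRATE API.
def check_one_latest_doc_per_group(engine: Engine, tablename: str) -> None:
    from sqlalchemy import text  # may duplicate this module's imports
    sql = """
        SELECT {rio_number}, SerialNumber
        FROM {tablename}
        GROUP BY {rio_number}, SerialNumber
        HAVING SUM(CASE WHEN {last_doc} = 1 THEN 1 ELSE 0 END) <> 1
    """.format(rio_number=CRATE_COL_RIO_NUMBER,
               tablename=tablename,
               last_doc=CRATE_COL_LAST_DOC)
    with engine.connect() as connection:
        bad_groups = connection.execute(text(sql)).fetchall()
    if bad_groups:
        log.warning("{}: {} document groups lack exactly one latest "
                    "revision".format(tablename, len(bad_groups)))
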
def process_progress_notes(table: Table, engine: Engine,
                           progargs: Any) -> None:
    crate_col_max_subnum = Column(CRATE_COL_MAX_SUBNUM, Integer,
                                  nullable=True)
    crate_col_last_note = Column(CRATE_COL_LAST_NOTE, Integer, nullable=True)
    table.append_column(crate_col_max_subnum)
    table.append_column(crate_col_last_note)
    add_columns(engine, table, [crate_col_max_subnum, crate_col_last_note])
    # We're always in "RiO land", not "RCEP land", for this one.
    add_indexes(engine, table, [
        {  # Joint index, for JOIN in UPDATE statement below
            'index_name': CRATE_IDX_RIONUM_NOTENUM,
            'column': '{rio_number}, NoteNum'.format(
                rio_number=CRATE_COL_RIO_NUMBER),
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_MAX_SUBNUM,
            'column': CRATE_COL_MAX_SUBNUM,
        },
        {  # Speeds up WHERE below. (Much, much faster for second run.)
            'index_name': CRATE_IDX_LAST_NOTE,
            'column': CRATE_COL_LAST_NOTE,
        },
    ])
    ensure_columns_present(engine, tablename=table.name,
                           column_names=["NoteNum", "SubNum",
                                         "EnteredInError"])
    if not progargs.print:
        ensure_columns_present(engine, tablename=table.name,
                               column_names=[CRATE_COL_MAX_SUBNUM,
                                             CRATE_COL_LAST_NOTE,
                                             CRATE_COL_RIO_NUMBER])

    # Find the maximum SubNum for each note, and store it.
    # Slow query, even with index.
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_MAX_SUBNUM)))
    execute(engine, """
        UPDATE p1
        SET p1.{max_subnum_col} = subq.max_subnum
        FROM {tablename} p1 JOIN (
            SELECT {rio_number}, NoteNum, MAX(SubNum) AS max_subnum
            FROM {tablename} p2
            GROUP BY {rio_number}, NoteNum
        ) subq
        ON subq.{rio_number} = p1.{rio_number}
        AND subq.NoteNum = p1.NoteNum
        WHERE p1.{max_subnum_col} IS NULL
    """.format(max_subnum_col=CRATE_COL_MAX_SUBNUM,
               tablename=table.name,
               rio_number=CRATE_COL_RIO_NUMBER))

    # Set a flag indicating the most recent SubNum for each note.
    log.info("Progress notes table {}: updating {}".format(
        repr(table.name), repr(CRATE_COL_LAST_NOTE)))
    execute(engine, """
        UPDATE {tablename} SET
            {last_note_col} =
                CASE WHEN SubNum = {max_subnum_col} THEN 1 ELSE 0 END
        WHERE {last_note_col} IS NULL
    """.format(tablename=table.name,
               last_note_col=CRATE_COL_LAST_NOTE,
               max_subnum_col=CRATE_COL_MAX_SUBNUM))

    # Create a view, if we're on a CPFT RCEP database
    if progargs.rcep and progargs.cpft:
        select_sql = """
            SELECT *
            FROM {tablename}
            WHERE
                (EnteredInError <> 1 OR EnteredInError IS NULL)
                AND {last_note_col} = 1
        """.format(tablename=table.name,
                   last_note_col=CRATE_COL_LAST_NOTE)
        create_view(engine, VIEW_RCEP_CPFT_PROGRESS_NOTES_CURRENT, select_sql)

def process_patient_table(table: Table, engine: Engine,
                          progargs: Any) -> None:
    log.info("Preprocessing patient table: {}".format(repr(table.name)))
    rio_type = table_is_rio_type(table.name, progargs)
    if rio_type:
        pk_col = get_effective_int_pk_col(table)
        rio_pk = pk_col if pk_col != CRATE_COL_PK else None
        string_pt_id = get_rio_patient_id_col(table)
        required_cols = [string_pt_id]
    else:  # RCEP type
        rio_pk = None
        required_cols = [RCEP_COL_PATIENT_ID]
        string_pt_id = RCEP_COL_PATIENT_ID
    if not progargs.print:
        required_cols.extend([CRATE_COL_PK, CRATE_COL_RIO_NUMBER])

    # -------------------------------------------------------------------------
    # Add pk and rio_number columns, if not present
    # -------------------------------------------------------------------------
    if rio_type and rio_pk is not None:
        crate_pk_col = Column(CRATE_COL_PK, BigInteger, nullable=True)
        # ... can't do NOT NULL; we need to populate it
        required_cols.append(rio_pk)
    else:  # RCEP type, or no PK in RiO
        crate_pk_col = make_bigint_autoincrement_column(CRATE_COL_PK,
                                                        engine.dialect)
        # ... autopopulates
    crate_rio_number_col = Column(CRATE_COL_RIO_NUMBER, BigInteger,
                                  nullable=True)
    # ... even if RiO numbers are INT, they come from VARCHAR(15) here, and
    # that can (and does) look numeric and overflow an INT.
    # SQL Server requires Table-bound columns in order to generate DDL:
    table.append_column(crate_pk_col)
    table.append_column(crate_rio_number_col)
    add_columns(engine, table, [crate_pk_col, crate_rio_number_col])

    # -------------------------------------------------------------------------
    # Update pk and rio_number values, where currently NULL
    # -------------------------------------------------------------------------
    ensure_columns_present(engine, tablename=table.name,
                           column_names=required_cols)
    cast_id_to_int = sql_fragment_cast_to_int(string_pt_id,
                                              dialect=engine.dialect)
    if rio_type and rio_pk:
        log.info("Table {}: updating columns {} and {}".format(
            repr(table.name), repr(CRATE_COL_PK), repr(CRATE_COL_RIO_NUMBER)))
        execute(engine, """
            UPDATE {tablename} SET
                {crate_pk} = {rio_pk},
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_pk} IS NULL
                OR {crate_rio_number} IS NULL
        """.format(tablename=table.name,
                   crate_pk=CRATE_COL_PK,
                   rio_pk=rio_pk,
                   crate_rio_number=CRATE_COL_RIO_NUMBER,
                   cast_id_to_int=cast_id_to_int))
    else:
        # RCEP format, or RiO with no PK
        # crate_pk is autogenerated as an INT IDENTITY field
        log.info("Table {}: updating column {}".format(
            repr(table.name), repr(CRATE_COL_RIO_NUMBER)))
        execute(engine, """
            UPDATE {tablename} SET
                {crate_rio_number} = {cast_id_to_int}
            WHERE
                {crate_rio_number} IS NULL
        """.format(tablename=table.name,
                   crate_rio_number=CRATE_COL_RIO_NUMBER,
                   cast_id_to_int=cast_id_to_int))

    # -------------------------------------------------------------------------
    # Add indexes, if absent
    # -------------------------------------------------------------------------
    # Note that the indexes are unlikely to speed up the WHERE ... IS NULL
    # search above, so it doesn't matter that we add these last. Their use is
    # for the subsequent CRATE anonymisation table scans.
    add_indexes(engine, table, [
        {
            'index_name': CRATE_IDX_PK,
            'column': CRATE_COL_PK,
            'unique': True,
        },
        {
            'index_name': CRATE_IDX_RIONUM,
            'column': CRATE_COL_RIO_NUMBER,
        },
    ])
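

from typing import Iterable, Set  # may duplicate this module's imports


# Hedged orchestration sketch, not the module's own driver: one plausible way
# to apply the per-table preprocessing functions above across a reflected
# schema (e.g. MetaData.sorted_tables). The caller supplies the table-name
# sets; this function and its parameters are illustrative assumptions, not
# existing CRATE API. The clinical-documents and progress-notes steps index
# CRATE_COL_RIO_NUMBER, which process_patient_table() creates, so they run
# after the generic patient-table step.
def preprocess_tables_sketch(tables: Iterable[Table],
                             engine: Engine,
                             progargs: Any,
                             patient_tables: Set[str],
                             master_patient_tables: Set[str],
                             clindocs_tables: Set[str],
                             progress_notes_tables: Set[str]) -> None:
    for table in tables:
        name = table.name
        # Generic patient/non-patient preprocessing first:
        if name in patient_tables:
            process_patient_table(table, engine, progargs)
        else:
            process_nonpatient_table(table, engine, progargs)
        # Then table-specific extras:
        if name in master_patient_tables:
            process_master_patient_table(table, engine, progargs)
        if name in clindocs_tables:
            process_clindocs_table(table, engine, progargs)
        if name in progress_notes_tables:
            process_progress_notes(table, engine, progargs)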