def validate_args(
    config: Config, s_start_datetime: str, s_end_datetime: str, updated_at: str
) -> Tuple[datetime, datetime, datetime]:
    """Validate the run arguments and parse the three datetime strings.

    Checks are applied in order: config, start, end, updated_at, then the
    start/end ordering. The first failing check is logged at error level and
    raised; later checks are not evaluated.

    Arguments:
        config {Config} -- application config; must be truthy
        s_start_datetime {str} -- window start, expected as YYMMDD_HHmm
        s_end_datetime {str} -- window end, expected as YYMMDD_HHmm
        updated_at {str} -- updated-at timestamp, expected as YYMMDD_HHmm

    Returns:
        Tuple[datetime, datetime, datetime] -- parsed start, end and updated_at

    Raises:
        Exception -- if config is falsy, any string fails
        valid_datetime_string, or the start is later than the end
    """
    prefix = "Aborting run: "

    def failure(reason: str) -> Exception:
        # Log the problem once and hand back the exception for the caller to raise.
        text = f"{prefix} {reason}"
        logger.error(text)
        return Exception(text)

    if not config:
        raise failure("Config required")

    format_checks = (
        (s_start_datetime, "Expected format of Start datetime is YYMMDD_HHmm"),
        (s_end_datetime, "Expected format of End datetime is YYMMDD_HHmm"),
        (updated_at, "Expected format of updated_at datetime is YYMMDD_HHmm"),
    )
    for raw_value, reason in format_checks:
        if not valid_datetime_string(raw_value):
            raise failure(reason)

    parsed_start, parsed_end, parsed_updated_at = [
        datetime.strptime(raw_value, MONGO_DATETIME_FORMAT) for raw_value, _ in format_checks
    ]

    # Equal start and end are accepted; only a start strictly after the end is rejected.
    if parsed_start > parsed_end:
        raise failure("End datetime must be greater than Start datetime")

    return parsed_start, parsed_end, parsed_updated_at
def run(settings_module: str = "", s_start_datetime: str = "", s_end_datetime: str = "") -> None:
    """Migrate the existing samples to have the filtered positive values.

    Validates the requested datetime window and, if the pre-migration check
    allows it, selects the samples in that window from Mongo, splits them by
    filtered positive version, passes each version's samples through
    update_filtered_positive_fields and then writes them to Mongo and to the
    MLWH. A per-version status summary is logged at the end, whether the
    migration completed or raised.

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
        s_start_datetime {str} -- start of the window, expected as YYMMDD_HHmm
        s_end_datetime {str} -- end of the window, expected as YYMMDD_HHmm

    Raises:
        Exception -- any exception raised during the migration is logged and
        re-raised. Invalid arguments do not raise: an error is logged and the
        function returns.
    """
    # NOTE(review): these argument checks run before logging.config.dictConfig
    # is applied further down, so their messages go through whatever logging
    # setup exists at call time.
    if not valid_datetime_string(s_start_datetime):
        logger.error("Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error("Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)
    fields_set_datetime = datetime.strptime(FILTERED_POSITIVE_FIELDS_SET_DATE, "%Y-%m-%d")

    # Equal start and end pass this check; only start strictly after end aborts.
    if start_datetime > end_datetime:
        logger.error("Aborting run: End datetime must be greater than Start datetime")
        return

    # NOTE(review): the message hard-codes "17th December"; presumably that is
    # the value of FILTERED_POSITIVE_FIELDS_SET_DATE -- confirm they stay in sync.
    if end_datetime > fields_set_datetime:
        logger.error("Aborting run: Date range must be prior to the 17th December")
        return

    config, settings_module = get_config(settings_module)

    logging.config.dictConfig(config.LOGGING)

    logger.info("-" * 80)
    logger.info("STARTING FILTERED POSITIVES LEGACY UPDATE")
    logger.info(f"Time start: {datetime.now()}")
    start_time = time.time()

    # Per-version progress trackers, one table per database. They are filled in
    # as each version is written and reported in the `finally` summary below.
    updated_key = "Updated"
    time_key = "Time taken"

    mongo_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {updated_key: False, time_key: 0.0},
        FILTERED_POSITIVE_VERSION_1: {updated_key: False, time_key: 0.0},
        FILTERED_POSITIVE_VERSION_2: {updated_key: False, time_key: 0.0},
    }

    mlwh_versions_updated = {
        FILTERED_POSITIVE_VERSION_0: {updated_key: False, time_key: 0.0},
        FILTERED_POSITIVE_VERSION_1: {updated_key: False, time_key: 0.0},
        FILTERED_POSITIVE_VERSION_2: {updated_key: False, time_key: 0.0},
    }

    try:
        # The pre-migration check decides whether the migration proceeds at all.
        continue_migration = pre_migration_filtered_positive_check(config, start_datetime, end_datetime)

        if continue_migration:
            logger.info(
                f"Selecting legacy samples from Mongo between {start_datetime} and {end_datetime}..."
            )
            samples = mongo_samples_by_date(config, start_datetime, end_datetime)

            legacy_samples_num = len(samples)
            logger.info(f"{legacy_samples_num} samples found from Mongo")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.info("Querying for v0 cherrypicked samples from MLWH")
            # Get v0 cherrypicked samples
            # (queried from a fixed early timestamp up to the v0/v1 cutoff)
            v0_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                "1970-01-01 00:00:01",
                V0_V1_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v0_cp_samples_df.index)} v0 cherrypicked samples"
            )  # type: ignore

            logger.info("Querying for cherrypicked samples from MLWH")
            # Get v1 cherrypicked samples
            # (queried between the v0/v1 cutoff and the v1/v2 cutoff)
            v1_cp_samples_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                V0_V1_CUTOFF_TIMESTAMP,
                V1_V2_CUTOFF_TIMESTAMP,
            )

            logger.debug(
                f"Found {len(v1_cp_samples_df.index)} v1 cherrypicked samples"
            )  # type: ignore

            logger.info("Splitting samples by version...")
            samples_by_version = split_mongo_samples_by_version(samples, v0_cp_samples_df, v1_cp_samples_df)

            # One timestamp is taken here and passed to every update call below.
            update_timestamp = datetime.now()

            for version, version_samples in samples_by_version.items():
                filtered_positive_identifier = filtered_positive_identifier_by_version(version)
                logger.info(f"Updating {version} filtered positives...")
                # The return value is ignored; presumably version_samples is
                # updated in place -- confirm against update_filtered_positive_fields.
                update_filtered_positive_fields(
                    filtered_positive_identifier,
                    version_samples,
                    version,
                    update_timestamp,
                )

            logger.info("Updated filtered positives")
            logger.info("Updating Mongo")

            for version, version_samples in samples_by_version.items():
                logger.info(
                    f"Updating {version} filtered positives in Mongo, total {len(version_samples)} records..."
                )
                mongo_update_start_time = time.time()
                mongo_updated = update_mongo_filtered_positive_fields(
                    config,
                    version_samples,
                    version,
                    update_timestamp,
                )
                if mongo_updated:
                    logger.info(f"Finished updating {version} filtered positives in Mongo")

                    mongo_update_end_time = time.time()
                    mongo_versions_updated[version][updated_key] = True
                    mongo_versions_updated[version][time_key] = round(
                        mongo_update_end_time - mongo_update_start_time, 2)

                    # The MLWH write for a version is only attempted once its
                    # Mongo write has reported success.
                    logger.info(f"Updating {version} filtered positives in MLWH...")
                    mlwh_update_start_time = time.time()

                    mlwh_updated = update_mlwh_filtered_positive_fields_batched(
                        config, version_samples, version, update_timestamp)

                    if mlwh_updated:
                        logger.info(f"Finished updating {version} filtered positives in MLWH")

                        mlwh_update_end_time = time.time()
                        mlwh_versions_updated[version][updated_key] = True
                        mlwh_versions_updated[version][time_key] = round(
                            mlwh_update_end_time - mlwh_update_start_time, 2)

            logger.info("Finished updating databases")
        else:
            logger.info("Now exiting migration")
    except Exception as e:
        # Log the failure with its traceback, then propagate it to the caller.
        logger.error("---------- Process aborted: ----------")
        logger.error(f"An exception occurred, at {datetime.now()}")
        logger.exception(e)
        raise
    finally:
        # Always report how far each database got, even after an exception.
        # The trailing backslashes join each entry onto a single output line.
        end_time = time.time()
        logger.info(f"""
        ---------- Processing status of filtered positive field migration: ----------
        -- Mongo updated with v0 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- Mongo updated with v1 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- Mongo updated with v2 filtered positives: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mongo_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        -- MLWH updated with v0 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_0][time_key]}s
        -- MLWH updated with v1 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_1][time_key]}s
        -- MLWH updated with v2 filtered positives: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][updated_key]}, \
time taken: \
{mlwh_versions_updated[FILTERED_POSITIVE_VERSION_2][time_key]}s
        """)

    logger.info(f"Time finished: {datetime.now()}")
    logger.info(f"Migration complete in {round(end_time - start_time, 2)}s")
    logger.info("=" * 80)
def migrate_all_dbs(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None: if not config: logger.error("Aborting run: Config required") return if not valid_datetime_string(s_start_datetime): logger.error( "Aborting run: Expected format of Start datetime is YYMMDD_HHmm") return if not valid_datetime_string(s_end_datetime): logger.error( "Aborting run: Expected format of End datetime is YYMMDD_HHmm") return start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT) end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT) if start_datetime > end_datetime: logger.error( "Aborting run: End datetime must be greater than Start datetime") return logger.info( f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}" ) try: mongo_docs_for_sql = [] # open connection to mongo with create_mongo_client(config) as client: mongo_db = get_mongo_db(config, client) samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES) # 1. get samples from mongo between these time ranges samples = get_samples(samples_collection, start_datetime, end_datetime) if not samples: logger.info("No samples in this time range.") return logger.debug(f"{len(samples)} samples to process") root_sample_ids, plate_barcodes = extract_required_cp_info(samples) logger.debug(f"{len(plate_barcodes)} unique plate barcodes") # 2. 
of these, find which have been cherry-picked and remove them from the list cp_samples_df = get_cherrypicked_samples(config, list(root_sample_ids), list(plate_barcodes)) if cp_samples_df is None: # we need to check if it is None explicitly raise Exception( "Unable to determine cherry-picked sample - potentially error connecting to MySQL" ) # get the samples between those dates minus the cherry-picked ones if cp_samples_df is not None and not cp_samples_df.empty: # we need a list of cherry-picked samples with their respective plate barcodes cp_samples = cp_samples_df[[ FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE ]].to_numpy().tolist() logger.debug( f"{len(cp_samples)} cherry-picked samples in this timeframe" ) samples = remove_cherrypicked_samples(samples, cp_samples) else: logger.debug("No cherry-picked samples in this timeframe") logger.info( f"{len(samples)} samples between these timestamps and not cherry-picked" ) # 3. add the UUID fields if not present add_sample_uuid_field(samples) # update the samples with source plate UUIDs samples_updated_with_source_plate_uuids(mongo_db, samples) # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples # from both steps) logger.info("Updating Mongo...") _ = update_mongo_fields(mongo_db, samples) logger.info("Finished updating Mongo") # convert mongo field values into MySQL format for sample in samples: mongo_docs_for_sql.append( map_mongo_sample_to_mysql(sample, copy_date=True)) if (num_sql_docs := len(mongo_docs_for_sql)) > 0: logger.info( f"Updating MLWH database for {num_sql_docs} sample documents") # create connection to the MLWH database with create_mysql_connection(config, False) as mlwh_conn: # 5. update the MLWH (should be an idempotent operation) run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql) # 6. 
add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any # positive samples in these plates update_dart_fields(config, samples) else:
def update_mlwh_with_legacy_samples(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None:
    """Copy Mongo samples created within a datetime window into the MLWH.

    Progress and problems are reported with print. Invalid arguments abort
    the run with a message; any exception raised while talking to either
    database is passed to print_exception rather than propagated.

    Arguments:
        config {Config} -- application config used to open both connections
        s_start_datetime {str} -- window start, expected as YYMMDD_HHmm
        s_end_datetime {str} -- window end, expected as YYMMDD_HHmm
    """
    format_checks = (
        (s_start_datetime, "Aborting run: Expected format of Start datetime is YYMMDD_HHmm"),
        (s_end_datetime, "Aborting run: Expected format of End datetime is YYMMDD_HHmm"),
    )
    for raw_value, complaint in format_checks:
        if not valid_datetime_string(raw_value):
            print(complaint)
            return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    # Equal start and end are allowed; only an end strictly before the start aborts.
    if end_datetime < start_datetime:
        print("Aborting run: End datetime must be greater than Start datetime")
        return

    print(f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}")

    try:
        found_docs = []
        mlwh_rows = []

        # The Mongo connection is only held while reading and converting the samples.
        with create_mongo_client(config) as mongo_client:
            samples_collection = get_mongo_collection(get_mongo_db(config, mongo_client), COLLECTION_SAMPLES)

            print("Selecting Mongo samples")

            # Both ends of the window are inclusive. list() pulls the whole
            # cursor into memory, so the window has to fit in RAM.
            created_in_window = {FIELD_CREATED_AT: {"$gte": start_datetime, "$lte": end_datetime}}
            found_docs = list(samples_collection.find(created_in_window))
            print(f"{len(found_docs)} documents found in the mongo database between these timestamps")

            # Convert the Mongo field values into MySQL format.
            mlwh_rows = [map_mongo_sample_to_mysql(doc, copy_date=True) for doc in found_docs]

        if not found_docs:
            print("No documents found for this timestamp range, nothing to insert or update in MLWH")
            return

        print(f"Updating MLWH database for {len(mlwh_rows)} sample documents")
        with create_mysql_connection(config, False) as mlwh_conn:
            # Insert/update all converted rows in the MLWH in one executemany call.
            run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mlwh_rows)
    except Exception:
        print_exception()
def test_valid_datetime_string():
    """An empty string is rejected and a YYMMDD_HHmm string is accepted."""
    assert valid_datetime_string("") is False
    assert valid_datetime_string("201209_0000") is True