def positive_result_samples_from_mongo(
    config: Config, plate_barcodes: Optional[List[str]] = None
) -> List[SampleDoc]:
    """Return every positive-result sample document, optionally limited to given plates.

    Arguments:
        config {Config} -- application config specifying database details
        plate_barcodes {Optional[List[str]]} -- when not None, only samples whose plate barcode
            is in this list are returned

    Returns:
        List[SampleDoc] -- the matching sample documents, fully loaded into a list
    """
    # The first stage always keeps the positives; the plate stage is added only on request.
    stages = [{"$match": {FIELD_RESULT: {"$eq": POSITIVE_RESULT_VALUE}}}]
    if plate_barcodes is not None:
        barcode_filter = {FIELD_PLATE_BARCODE: {"$in": plate_barcodes}}
        stages.append({"$match": barcode_filter})  # type: ignore

    with create_mongo_client(config) as client:
        samples_collection = get_mongo_collection(get_mongo_db(config, client), COLLECTION_SAMPLES)

        # The cursor is drained into RAM in one go (assuming there is enough memory).
        # Open question kept from the authors: project to an object with fewer fields?
        return list(samples_collection.aggregate(stages))
def update_dart(config: Config, start_datetime: datetime, end_datetime: datetime) -> None:
    """Fetch samples for a time range from Mongo and pass them on to the DART update.

    Arguments:
        config {Config} -- application config specifying database details
        start_datetime {datetime} -- start of the time range handed to get_samples
        end_datetime {datetime} -- end of the time range handed to get_samples

    Nothing is returned and no exception propagates: any failure is logged instead.
    """
    try:
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)
            samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

            # get samples from mongo between these time ranges and with updated UUIDs
            samples = get_samples(samples_collection, start_datetime, end_datetime)

            if not samples:
                logger.info("No samples in this time range and with updated UUIDs")
                return

            logger.debug(f"{len(samples)} samples to process")

            # Only the barcodes are needed here, and only to report how many plates are involved.
            _, plate_barcodes = extract_required_cp_info(samples)
            logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

            update_dart_fields(config, samples)
    except Exception as e:
        # The message names this routine so the log points at the right operation.
        logger.error("Error while attempting to update DART")
        logger.exception(e)
def filtered_positive_fields_set(config: Config, start_datetime: datetime, end_datetime: datetime) -> bool:
    """Report whether any sample created in the date range already has the filtered positive field.

    A hit would indicate that the migration has already been run on those samples.

    Args:
        config {Config} -- application config specifying database details
        start_datetime {datetime} -- lower limit of sample creation date (inclusive)
        end_datetime {datetime} -- upper limit of sample creation date (exclusive)

    Returns:
        {bool} -- True when at least one sample in the range has the field present
    """
    query = {
        FIELD_CREATED_AT: {"$gte": start_datetime, "$lt": end_datetime},
        FIELD_FILTERED_POSITIVE: {"$exists": True},
    }

    with create_mongo_client(config) as client:
        samples_collection = get_mongo_collection(get_mongo_db(config, client), COLLECTION_SAMPLES)
        versioned_count: int = samples_collection.count_documents(query)

    return versioned_count > 0
def process(run_id: str, config: Config = None) -> List[List[str]]:
    """Generate cherrypicker test data for a run and process it via the usual runner.

    The specification of the plates to be generated should be in Mongo. Each plate will contain
    an exact number of positive results between 0 and 96 as specified. Up to 200 plates can be
    generated at a time.

    Arguments:
        run_id: str - The ID of the run. If this is not found in Mongo an exception will be thrown.
        config: Config - Application config; when None it is loaded with get_config().

    Returns:
        Metadata about the plates generated, as:
        [ [ "barcode1", "description1" ], [ "barcode2", "description2" ] ]
    """
    logger.info("Begin generating data.")

    if config is None:
        config, _ = get_config()

    with create_mongo_client(config) as mongo_client:
        test_data_collection = get_mongo_collection(
            get_mongo_db(config, mongo_client), COLLECTION_CHERRYPICK_TEST_DATA
        )

        return process_run(config, test_data_collection, run_id)
def mongo_database(mongo_client):
    """Yield ``(config, db)`` for a database that was emptied and re-indexed beforehand.

    Generator, presumably used as a pytest fixture (the decorator is not visible here).
    ``mongo_client`` is a ``(config, client)`` pair.
    """
    config, client = mongo_client
    database = get_mongo_db(config, client)

    # Ensure any existing data is gone before a test starts
    client.drop_database(database)

    # Creating the indexes also creates the empty source_plates and samples collections
    ensure_mongo_collections_indexed(database)

    yield config, database
def mongo_database(mongo_client):
    """Yield ``(config, db)`` and drop the database once the consumer is finished with it.

    Generator, presumably used as a pytest fixture (the decorator is not visible here).
    ``mongo_client`` is a ``(config, client)`` pair.
    """
    config, client = mongo_client
    database = get_mongo_db(config, client)

    try:
        yield config, database
    finally:
        # Drop the database after each test to ensure they are independent.
        # A transaction may be more appropriate here, but that means significant
        # code changes, as 'sessions' need to be passed around. It is also unclear
        # which version of mongo is being used in production.
        client.drop_database(database)
def _record_source_plate_in_mongo_db(self, session: ClientSession) -> ExportResult:
    """Find an existing plate in MongoDB or add a new one for the plate in the message.

    Arguments:
        session {ClientSession} -- the Mongo session every read and write is performed in

    Returns:
        ExportResult -- success when the plate already exists with the same lab ID or was newly
            inserted; failure (with one CreatePlateError) when the barcode already exists under
            a different lab ID

    Raises:
        TransientRabbitError -- for any exception raised while doing the above
    """
    # Bound before the try so the error handler can always format its messages, even when
    # reading the barcode from the message is the step that fails.
    plate_barcode = "<unknown>"

    try:
        plate_barcode = self._message.plate_barcode.value
        lab_id_field = self._message.lab_id

        session_database = get_mongo_db(self._config, session.client)
        source_plates_collection = get_mongo_collection(session_database, COLLECTION_SOURCE_PLATES)
        mongo_plate = source_plates_collection.find_one(
            filter={FIELD_BARCODE: plate_barcode}, session=session
        )

        if mongo_plate is not None:
            # There was a plate in Mongo DB for this barcode, so check that the lab ID matches
            # and then return. The UUID is recorded even when the lab IDs turn out to differ.
            self._plate_uuid = mongo_plate[FIELD_LH_SOURCE_PLATE_UUID]

            if mongo_plate[FIELD_MONGO_LAB_ID] != lab_id_field.value:
                return ExportResult(
                    success=False,
                    create_plate_errors=[
                        CreatePlateError(
                            type=ErrorType.ExportingPlateAlreadyExists,
                            origin=RABBITMQ_CREATE_FEEDBACK_ORIGIN_PLATE,
                            description=(
                                f"Plate barcode '{plate_barcode}' already exists "
                                f"with a different lab ID: '{mongo_plate[FIELD_MONGO_LAB_ID]}'"
                            ),
                            field=lab_id_field.name,
                        )
                    ],
                )

            return ExportResult(success=True, create_plate_errors=[])

        # Create a new plate for this message.
        mongo_plate = create_source_plate_doc(plate_barcode, lab_id_field.value)
        source_plates_collection.insert_one(mongo_plate, session=session)
        self._plate_uuid = mongo_plate[FIELD_LH_SOURCE_PLATE_UUID]

        return ExportResult(success=True, create_plate_errors=[])
    except Exception as ex:
        LOGGER.critical(f"Error accessing MongoDB during export of source plate '{plate_barcode}': {ex}")
        LOGGER.exception(ex)

        # Chain the original exception so the root cause stays attached to the transient error.
        raise TransientRabbitError(
            f"There was an error updating MongoDB while exporting plate with barcode '{plate_barcode}'."
        ) from ex
def run(
    sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "", centre_prefix: str = ""
) -> None:
    """Process the SFTP-sourced centres, then update the priority samples.

    Arguments:
        sftp {bool} -- when True, each centre's CSV files are downloaded before processing
        keep_files {bool} -- when False, a centre's download directory is cleaned up afterwards
            (provided it is walkable)
        add_to_dart {bool} -- passed through to process_files and update_priority_samples
        settings_module {str} -- settings module name handed to get_config
        centre_prefix {str} -- when non-empty, only centres with exactly this prefix are
            processed; otherwise only centres included in scheduled runs are processed

    Any exception is logged rather than propagated.
    """
    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        # get or create the centres collection and filter down to only those with an SFTP data source
        centres = get_centres_config(config, CENTRE_DATA_SOURCE_SFTP)

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)
            ensure_mongo_collections_indexed(db)

            if centre_prefix:
                # We are only interested in processing a single centre
                centres = [
                    centre_conf
                    for centre_conf in centres
                    if centre_conf.get(CENTRE_KEY_PREFIX) == centre_prefix
                ]
            else:
                # We should only include centres that are to be batch processed
                centres = [
                    centre_conf
                    for centre_conf in centres
                    if centre_conf.get(CENTRE_KEY_INCLUDE_IN_SCHEDULED_RUNS, True)
                ]

            # Every Centre is constructed up front, before any of them is processed.
            centres_instances = [Centre(config, centre_conf) for centre_conf in centres]

            for centre in centres_instances:
                logger.info("*" * 80)
                logger.info(f"Processing {centre.centre_config[CENTRE_KEY_NAME]}")

                try:
                    if sftp:
                        centre.download_csv_files()

                    centre.process_files(add_to_dart)
                except Exception as e:
                    # One failing centre must not stop the remaining centres.
                    logger.error(f"Error in centre '{centre.centre_config[CENTRE_KEY_NAME]}'")
                    logger.exception(e)
                finally:
                    if not keep_files and centre.is_download_dir_walkable:
                        centre.clean_up()

            # Prioritisation of samples
            update_priority_samples(db, config, add_to_dart)

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
def update_mongo(config: Config, updated_at: datetime) -> None:
    """Copy sample UUIDs from MLWH rows onto the matching Mongo sample documents.

    Rows of ``lighthouse_sample`` whose ``updated_at`` is later than the given time are read via
    mysql_sample_generator. For every row that carries a UUID, the Mongo sample with the row's
    Mongo ID is updated -- only when its stored UUID differs -- to hold that UUID, the
    ``UUID_UPDATED`` flag and a fresh updated-at timestamp.

    Arguments:
        config {Config} -- application config specifying database details
        updated_at {datetime} -- only MLWH rows updated after this time are read; it is
            formatted into the query to minute precision
    """
    query = f"SELECT * FROM lighthouse_sample WHERE updated_at > '{updated_at.strftime('%Y-%m-%d %H:%M')}'"

    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        counter = 0
        for mysql_sample in mysql_sample_generator(config=config, query=query):
            mlwh_sample_uuid = mysql_sample.get(MLWH_LH_SAMPLE_UUID)

            # Rows without a UUID have nothing to copy across.
            if mlwh_sample_uuid is None:
                continue

            # The "$ne" clause makes this a no-op for samples that already hold the UUID, so
            # the call returns None for them and they are not counted.
            mongo_sample = samples_collection.find_one_and_update(
                filter={
                    FIELD_MONGODB_ID: ObjectId(mysql_sample.get(MLWH_MONGODB_ID)),
                    FIELD_LH_SAMPLE_UUID: {"$ne": mlwh_sample_uuid},
                },
                update={
                    "$set": {
                        FIELD_LH_SAMPLE_UUID: mlwh_sample_uuid,
                        UUID_UPDATED: True,
                        FIELD_UPDATED_AT: datetime.utcnow(),
                    }
                },
            )

            if mongo_sample is not None:
                counter += 1

                # Report progress only at the moment the counter reaches a multiple of 5000, so
                # rows that update nothing cannot repeat the same progress line.
                if counter % 5000 == 0:
                    logger.debug(f"{counter = }")

        logger.debug(f"{counter} samples updated in mongo")
def mongo_samples_by_date(config: Config, start_datetime: datetime, end_datetime: datetime) -> List[SampleDoc]:
    """Get all samples from Mongo whose creation date falls within the given range.

    Arguments:
        config {Config} -- application config specifying database details
        start_datetime {datetime} -- lower limit of sample creation date (inclusive)
        end_datetime {datetime} -- upper limit of sample creation date (exclusive)

    Returns:
        List[SampleDoc] -- the matching Mongo samples, fully loaded into a list
    """
    created_in_range = {FIELD_CREATED_AT: {"$gte": start_datetime, "$lt": end_datetime}}

    with create_mongo_client(config) as client:
        samples_collection = get_mongo_collection(get_mongo_db(config, client), COLLECTION_SAMPLES)

        return list(samples_collection.find(created_in_range))
def get_centres_config(config: Config, data_source: str = "") -> List[CentreConf]:
    """Get the centres config from MongoDB.

    If the centres collection does not exist yet, it is indexed and populated from the centres
    in the app config before being read.

    Arguments:
        config {Config}: The configuration object for the whole application.
        data_source {str}: The data source to keep (compared case-insensitively); a falsy value
            such as the default "" applies no filter.

    Return:
        List[CentreConf]: A List of CentreConf from MongoDB matching the given data source.
    """

    def _has_data_source(centre) -> bool:
        # A centre lacking a usable data source value simply does not match.
        try:
            return centre.get(CENTRE_KEY_DATA_SOURCE).lower() == data_source.lower()
        except AttributeError:
            return False

    with create_mongo_client(config) as client:
        db = get_mongo_db(config, client)

        # Record whether the collection already exists before fetching its handle.
        collection_already_exists = collection_exists(db, COLLECTION_CENTRES)
        centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)

        if not collection_already_exists:
            # Populate the centres collection from the config values
            create_index(centres_collection, FIELD_CENTRE_NAME, unique=True)
            populate_mongo_collection(centres_collection, config.CENTRES, FIELD_CENTRE_NAME)  # type: ignore

        # Get the centres collection from MongoDB
        centres = [cast(CentreConf, document) for document in centres_collection.find()]

        if not data_source:
            return centres

        return [centre for centre in centres if _has_data_source(centre)]
def migrate_all_dbs(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None: if not config: logger.error("Aborting run: Config required") return if not valid_datetime_string(s_start_datetime): logger.error( "Aborting run: Expected format of Start datetime is YYMMDD_HHmm") return if not valid_datetime_string(s_end_datetime): logger.error( "Aborting run: Expected format of End datetime is YYMMDD_HHmm") return start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT) end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT) if start_datetime > end_datetime: logger.error( "Aborting run: End datetime must be greater than Start datetime") return logger.info( f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}" ) try: mongo_docs_for_sql = [] # open connection to mongo with create_mongo_client(config) as client: mongo_db = get_mongo_db(config, client) samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES) # 1. get samples from mongo between these time ranges samples = get_samples(samples_collection, start_datetime, end_datetime) if not samples: logger.info("No samples in this time range.") return logger.debug(f"{len(samples)} samples to process") root_sample_ids, plate_barcodes = extract_required_cp_info(samples) logger.debug(f"{len(plate_barcodes)} unique plate barcodes") # 2. 
of these, find which have been cherry-picked and remove them from the list cp_samples_df = get_cherrypicked_samples(config, list(root_sample_ids), list(plate_barcodes)) if cp_samples_df is None: # we need to check if it is None explicitly raise Exception( "Unable to determine cherry-picked sample - potentially error connecting to MySQL" ) # get the samples between those dates minus the cherry-picked ones if cp_samples_df is not None and not cp_samples_df.empty: # we need a list of cherry-picked samples with their respective plate barcodes cp_samples = cp_samples_df[[ FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE ]].to_numpy().tolist() logger.debug( f"{len(cp_samples)} cherry-picked samples in this timeframe" ) samples = remove_cherrypicked_samples(samples, cp_samples) else: logger.debug("No cherry-picked samples in this timeframe") logger.info( f"{len(samples)} samples between these timestamps and not cherry-picked" ) # 3. add the UUID fields if not present add_sample_uuid_field(samples) # update the samples with source plate UUIDs samples_updated_with_source_plate_uuids(mongo_db, samples) # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples # from both steps) logger.info("Updating Mongo...") _ = update_mongo_fields(mongo_db, samples) logger.info("Finished updating Mongo") # convert mongo field values into MySQL format for sample in samples: mongo_docs_for_sql.append( map_mongo_sample_to_mysql(sample, copy_date=True)) if (num_sql_docs := len(mongo_docs_for_sql)) > 0: logger.info( f"Updating MLWH database for {num_sql_docs} sample documents") # create connection to the MLWH database with create_mysql_connection(config, False) as mlwh_conn: # 5. update the MLWH (should be an idempotent operation) run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql) # 6. 
add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any # positive samples in these plates update_dart_fields(config, samples) else:
def _record_samples_in_mongo_db(self, session: ClientSession) -> ExportResult:
    """Insert the message's sample documents into the Mongo samples collection.

    Arguments:
        session {ClientSession} -- the Mongo session the insert is performed in

    Returns:
        ExportResult -- success when the insert went through; failure with one CreatePlateError
            per duplicate-key write error (code 11000) when the bulk write reported duplicates

    Raises:
        TransientRabbitError -- for any other exception, including a bulk write error that
            contains no duplicate-key errors
    """
    message_uuid = self._message.message_uuid.value
    LOGGER.debug(
        f"Attempting to insert {self._message.total_samples} "
        f"samples from message with UUID {message_uuid} into mongo..."
    )

    try:
        try:
            samples_collection = get_mongo_collection(
                get_mongo_db(self._config, session.client), COLLECTION_SAMPLES
            )
            result = samples_collection.insert_many(
                documents=self._mongo_sample_docs, ordered=False, session=session
            )
        except BulkWriteError as ex:
            LOGGER.warning("BulkWriteError: will now establish whether this was because of duplicate samples.")

            duplication_errors = [
                write_error
                for write_error in ex.details["writeErrors"]  # type: ignore
                if write_error["code"] == 11000
            ]

            if not duplication_errors:
                # There weren't any duplication errors so this is not a problem with the message contents!
                raise

            # Each duplicate is described using the document ("op") that failed to insert.
            rejected_docs = [write_error["op"] for write_error in duplication_errors]
            create_plate_errors = [
                CreatePlateError(
                    type=ErrorType.ExportingSampleAlreadyExists,
                    origin=RABBITMQ_CREATE_FEEDBACK_ORIGIN_SAMPLE,
                    description=(
                        f"Sample with UUID '{duplicate[FIELD_LH_SAMPLE_UUID]}' was unable to be inserted "
                        "because another sample already exists with "
                        f"Lab ID = '{duplicate[FIELD_MONGO_LAB_ID]}'; "
                        f"Root Sample ID = '{duplicate[FIELD_MONGO_ROOT_SAMPLE_ID]}'; "
                        f"RNA ID = '{duplicate[FIELD_MONGO_RNA_ID]}'; "
                        f"Result = '{duplicate[FIELD_MONGO_RESULT]}'"
                    ),
                    sample_uuid=duplicate[FIELD_LH_SAMPLE_UUID],
                )
                for duplicate in rejected_docs
            ]

            return ExportResult(success=False, create_plate_errors=create_plate_errors)
    except Exception as ex:
        LOGGER.critical(f"Error accessing MongoDB during export of samples for message UUID '{message_uuid}': {ex}")
        LOGGER.exception(ex)

        raise TransientRabbitError(
            f"There was an error updating MongoDB while exporting samples for message UUID '{message_uuid}'."
        )

    self._samples_inserted = len(result.inserted_ids)
    LOGGER.info(f"{self._samples_inserted} samples inserted into mongo.")

    return ExportResult(success=True, create_plate_errors=[])
def run(sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "") -> None:
    """Prepare the Mongo collections and their indexes, then process every configured centre.

    Arguments:
        sftp {bool} -- when True, each centre's CSV files are downloaded before processing
        keep_files {bool} -- when False, a centre's download directory is cleaned up afterwards
            (provided it is walkable)
        add_to_dart {bool} -- passed through to each centre's process_files
        settings_module {str} -- settings module name handed to get_config

    Any exception is logged rather than propagated.
    """

    def _create_logged_index(collection, field, **index_options) -> None:
        # Every single-field index is announced in the same wording before it is created.
        logger.debug(f"Creating index '{field}' on '{collection.full_name}'")
        collection.create_index(field, **index_options)

    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        centres = config.CENTRES

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)

            # get or create the centres collection
            centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)
            _create_logged_index(centres_collection, FIELD_CENTRE_NAME, unique=True)
            populate_collection(centres_collection, centres, FIELD_CENTRE_NAME)

            # get or create the source plates collection
            source_plates_collection = get_mongo_collection(db, COLLECTION_SOURCE_PLATES)
            _create_logged_index(source_plates_collection, FIELD_BARCODE, unique=True)
            _create_logged_index(source_plates_collection, FIELD_LH_SOURCE_PLATE_UUID, unique=True)

            with samples_collection_accessor(db, COLLECTION_SAMPLES) as samples_collection:
                # Index on plate barcode to make it easier to select based on plate barcode
                _create_logged_index(samples_collection, FIELD_PLATE_BARCODE)

                # Index on result column to make it easier to select the positives
                _create_logged_index(samples_collection, FIELD_RESULT)

                # Index on unique combination of columns
                logger.debug(f"Creating compound index on '{samples_collection.full_name}'")
                # create compound index on 'Root Sample ID', 'RNA ID', 'Result', 'Lab ID' - some
                # data had the same plate tested at another time so ignore the data if it is
                # exactly the same
                compound_key = [
                    (FIELD_ROOT_SAMPLE_ID, pymongo.ASCENDING),
                    (FIELD_RNA_ID, pymongo.ASCENDING),
                    (FIELD_RESULT, pymongo.ASCENDING),
                    (FIELD_LAB_ID, pymongo.ASCENDING),
                ]
                samples_collection.create_index(compound_key, unique=True)

                # Index on lh_source_plate_uuid column
                # Added to make lighthouse API source completion event call query more efficient
                _create_logged_index(samples_collection, FIELD_LH_SOURCE_PLATE_UUID)

            # Every Centre is constructed up front, before any of them is processed.
            centres_instances = [Centre(config, centre_conf) for centre_conf in centres]

            for centre in centres_instances:
                logger.info("*" * 80)
                logger.info(f"Processing {centre.centre_config['name']}")

                try:
                    if sftp:
                        centre.download_csv_files()

                    centre.process_files(add_to_dart)
                except Exception as e:
                    # One failing centre must not stop the remaining centres.
                    logger.error("An exception occured")
                    logger.error(f"Error in centre {centre.centre_config['name']}")
                    logger.exception(e)
                finally:
                    if not keep_files and centre.is_download_dir_walkable:
                        centre.clean_up()

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
def update_mlwh_with_legacy_samples(config: Config, s_start_datetime: str = "", s_end_datetime: str = "") -> None:
    """Copy Mongo samples created within a time range into the MLWH database.

    Progress and validation failures are reported with print; an exception during the update
    is reported through print_exception rather than propagated.

    Arguments:
        config {Config} -- application config specifying database details
        s_start_datetime {str} -- start of the range (inclusive), expected as YYMMDD_HHmm
        s_end_datetime {str} -- end of the range (inclusive), expected as YYMMDD_HHmm
    """
    # Guard clauses: both bounds must parse and must be in order.
    if not valid_datetime_string(s_start_datetime):
        print("Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        print("Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        print("Aborting run: End datetime must be greater than Start datetime")
        return

    print(f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}")

    try:
        created_in_range = {FIELD_CREATED_AT: {"$gte": start_datetime, "$lte": end_datetime}}

        # open connection mongo
        with create_mongo_client(config) as client:
            samples_collection = get_mongo_collection(get_mongo_db(config, client), COLLECTION_SAMPLES)

            print("Selecting Mongo samples")

            # this should take everything from the cursor find into RAM memory (assuming you have
            # enough memory)
            mongo_docs = list(samples_collection.find(created_in_range))

        number_docs_found = len(mongo_docs)
        print(f"{number_docs_found} documents found in the mongo database between these timestamps")

        # convert mongo field values into MySQL format
        mongo_docs_for_sql = [map_mongo_sample_to_mysql(doc, copy_date=True) for doc in mongo_docs]

        if number_docs_found > 0:
            print(f"Updating MLWH database for {len(mongo_docs_for_sql)} sample documents")

            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:
                # execute sql query to insert/update timestamps into MLWH
                run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT, mongo_docs_for_sql)
        else:
            print("No documents found for this timestamp range, nothing to insert or update in MLWH")
    except Exception:
        print_exception()
def test_get_mongo_db(mongo_client):
    """get_mongo_db should return a ``Database`` for the supplied config and client."""
    config, client = mongo_client

    # isinstance rather than an exact type comparison, so a Database subclass is still accepted
    assert isinstance(get_mongo_db(config, client), Database)
def setup_mongo_indexes(config):
    """Open a Mongo client for ``config`` and ensure its database's collections are indexed."""
    with create_mongo_client(config) as mongo_client:
        ensure_mongo_collections_indexed(get_mongo_db(config, mongo_client))
def update_mongo_filtered_positive_fields(
    config: Config, samples: List[SampleDoc], version: str, update_timestamp: datetime
) -> bool:
    """Batch update the filtered positive fields of the given samples in the Mongo database.

    Arguments:
        config {Config} -- application config specifying database details
        samples {List[Sample]} -- the list of samples whose filtered positive fields should be updated
        version {str} -- the filtered positive identifier version used
        update_timestamp {datetime} -- the timestamp at which the update was performed

    Returns:
        bool -- always True once every batch has been sent; failures surface as exceptions
    """
    with create_mongo_client(config) as client:
        samples_collection = get_mongo_collection(get_mongo_db(config, client), COLLECTION_SAMPLES)

        def _write_flag(sample_ids, is_positive: bool) -> None:
            # One bulk update per flag value: same version and timestamp, different flag.
            samples_collection.update_many(
                {FIELD_MONGODB_ID: {"$in": sample_ids}},
                {
                    "$set": {
                        FIELD_FILTERED_POSITIVE: is_positive,
                        FIELD_FILTERED_POSITIVE_VERSION: version,
                        FIELD_FILTERED_POSITIVE_TIMESTAMP: update_timestamp,
                    }
                },
            )

        num_samples = len(samples)
        SAMPLES_PER_QUERY = 15000

        logger.debug(f"Attempting to update {num_samples} rows in Mongo in batches of {SAMPLES_PER_QUERY}")

        for batch_start in range(0, num_samples, SAMPLES_PER_QUERY):
            batch_end = batch_start + SAMPLES_PER_QUERY
            logger.debug(f"Updating records between {batch_start} and {batch_end}")

            # get ids of those that are filtered positive, and those that aren't
            positive_ids = []
            negative_ids = []
            for sample in samples[batch_start:batch_end]:
                # Only the literal True counts as positive; anything else is written as False.
                bucket = positive_ids if sample[FIELD_FILTERED_POSITIVE] is True else negative_ids
                bucket.append(sample[FIELD_MONGODB_ID])

            _write_flag(positive_ids, True)
            _write_flag(negative_ids, False)

    return True
def run(settings_module: str = "") -> None:
    """Load the config for ``settings_module`` and add timestamps to the samples in its database.

    Arguments:
        settings_module {str} -- settings module name handed to get_config
    """
    config, settings_module = get_config(settings_module)

    with create_mongo_client(config) as mongo_client:
        database = get_mongo_db(config, mongo_client)
        sample_timestamps_helper.add_timestamps_to_samples(database)
def _mongo_db(self) -> Database:
    """Return this instance's Mongo database handle, creating it on first use.

    The handle is stored on the instance, so later calls reuse it rather than opening another
    client.

    Returns:
        Database -- the database obtained from get_mongo_db for this instance's config
    """
    # Read the attribute directly instead of probing with hasattr(self, "__mongo_db"): inside a
    # class the attribute name is mangled but a string literal is not, so such a probe never
    # finds the stored handle and a new client would be created on every call.
    try:
        return self.__mongo_db
    except AttributeError:
        client = create_mongo_client(self._config)
        self.__mongo_db = get_mongo_db(self._config, client)

        return self.__mongo_db