def positive_result_samples_from_mongo(
        config: Config,
        plate_barcodes: Optional[List[str]] = None) -> List[SampleDoc]:
    """Fetch positive samples from Mongo contained within specified plates.

    Arguments:
        config {Config} -- application config specifying database details
        plate_barcodes {Optional[List[str]]} -- barcodes of plates whose samples we are concerned with

    Returns:
        List[SampleDoc] -- List of positive samples contained within the specified plates
    """
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        pipeline = [{"$match": {FIELD_RESULT: {"$eq": POSITIVE_RESULT_VALUE}}}]

        if plate_barcodes is not None:
            pipeline.append(
                {"$match": {
                    FIELD_PLATE_BARCODE: {
                        "$in": plate_barcodes
                    }
                }})  # type: ignore

        # this loads everything from the cursor find into memory (assuming you
        # have enough memory)
        # should we project to an object that has fewer fields? (see the sketch below)
        return list(samples_collection.aggregate(pipeline))
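
# A sketch of the projection idea raised in the comment above: a "$project"
# stage keeps only the fields callers need, shrinking the materialised list.
# The projected field choice here is an assumption, not this module's API.
def positive_result_samples_projected(
        config: Config,
        plate_barcodes: Optional[List[str]] = None) -> List[SampleDoc]:
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        pipeline = [{"$match": {FIELD_RESULT: {"$eq": POSITIVE_RESULT_VALUE}}}]

        if plate_barcodes is not None:
            pipeline.append({"$match": {FIELD_PLATE_BARCODE: {"$in": plate_barcodes}}})  # type: ignore

        # drop everything except the identifiers; _id is kept by default
        pipeline.append({"$project": {FIELD_ROOT_SAMPLE_ID: 1, FIELD_PLATE_BARCODE: 1}})  # type: ignore

        return list(samples_collection.aggregate(pipeline))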
Example #2
def update_dart(config: Config, start_datetime: datetime,
                end_datetime: datetime) -> None:
    try:
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            # get samples from mongo between these time ranges and with updated UUIDs
            samples = get_samples(samples_collection, start_datetime,
                                  end_datetime)

        if not samples:
            logger.info("No samples in this time range and with updated UUIDs")
            return

        logger.debug(f"{len(samples)} samples to process")

        _, plate_barcodes = extract_required_cp_info(samples)

        logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

        update_dart_fields(config, samples)
    except Exception as e:
        logger.error("Error while attempting to migrate all DBs")
        logger.exception(e)
def filtered_positive_fields_set(config: Config, start_datetime: datetime,
                                 end_datetime: datetime) -> bool:
    """Find if the filtered positive version field has been set on any of samples in date range.
       This would indicate that the migration has already been run on those samples.

    Args:
        config {Config} -- application config specifying database details
        start_datetime {datetime} -- lower limit of sample creation date
        end_datetime {datetime} -- upper limit of sample creation date

    Returns:
        {bool} -- whether the filtered positive version field is set on any samples in the range
    """
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        num_versioned_samples: int = samples_collection.count_documents({
            FIELD_CREATED_AT: {
                "$gte": start_datetime,
                "$lt": end_datetime
            },
            FIELD_FILTERED_POSITIVE: {
                "$exists": True
            },
        })

        return num_versioned_samples > 0
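
# Usage sketch: guard a migration run on the check above. The helper name is
# hypothetical, not this page's API.
def abort_if_already_migrated(config: Config, start_datetime: datetime, end_datetime: datetime) -> bool:
    """Return True when the caller should abort because the range was already migrated."""
    if filtered_positive_fields_set(config, start_datetime, end_datetime):
        logger.warning("Filtered positive fields already set in this range; aborting")
        return True
    return False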
Example #4
def process(run_id: str, config: Optional[Config] = None) -> List[List[str]]:
    """Generates cherrypicker test data for processing by Crawler and then
    processes it via the usual runner.

    The specification of the plates to be generated should be in Mongo. Each
    plate will contain an exact number of positive results between 0 and 96 as
    specified. Up to 200 plates can be generated at a time.

    Arguments:
        run_id: str - The ID of the run. If this is not found in Mongo an
            exception will be thrown.
        config: Optional[Config] - The application config; loaded via get_config()
            when not provided.

    Returns:
        Metadata about the plates generated, as:
        [ [ "barcode1", "description1" ], [ "barcode2", "description2" ] ]
    """
    logger.info("Begin generating data.")

    if config is None:
        config, _ = get_config()

    with create_mongo_client(config) as mongo_client:
        mongo_db = get_mongo_db(config, mongo_client)
        collection = get_mongo_collection(mongo_db,
                                          COLLECTION_CHERRYPICK_TEST_DATA)

        return process_run(config, collection, run_id)
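
# Usage sketch ("hypothetical-run-id" is a placeholder, not a real run):
#
#     plates = process("hypothetical-run-id")
#     for barcode, description in plates:
#         logger.info(f"{barcode}: {description}")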
Example #5
def mongo_database(mongo_client):
    config, mongo_client = mongo_client
    db = get_mongo_db(config, mongo_client)

    # Ensure any existing data is gone before a test starts
    mongo_client.drop_database(db)

    # Create indexes on collections -- this also creates the empty source_plates and samples collections
    ensure_mongo_collections_indexed(db)

    yield config, db
Example #6
def mongo_database(mongo_client):
    config, mongo_client = mongo_client
    db = get_mongo_db(config, mongo_client)
    try:
        yield config, db
    # Drop the database after each test to ensure they are independent
    # A transaction may be more appropriate here, but that means significant
    # code changes, as 'sessions' need to be passed around. I'm also not
    # sure what version of mongo is being used in production.
    finally:
        mongo_client.drop_database(db)
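
# Usage sketch in a test (assumes the fixture above is registered with pytest;
# COLLECTION_SAMPLES is this page's collection-name constant):
def test_database_starts_empty(mongo_database):
    _config, db = mongo_database
    assert db[COLLECTION_SAMPLES].count_documents({}) == 0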
    def _record_source_plate_in_mongo_db(
            self, session: ClientSession) -> ExportResult:
        """Find an existing plate in MongoDB or add a new one for the plate in the message."""
        try:
            plate_barcode = self._message.plate_barcode.value
            lab_id_field = self._message.lab_id

            session_database = get_mongo_db(self._config, session.client)
            source_plates_collection = get_mongo_collection(
                session_database, COLLECTION_SOURCE_PLATES)
            mongo_plate = source_plates_collection.find_one(
                filter={FIELD_BARCODE: plate_barcode}, session=session)

            if mongo_plate is not None:
                # There was a plate in Mongo DB for this barcode so check that the lab ID matches then return.
                self._plate_uuid = mongo_plate[FIELD_LH_SOURCE_PLATE_UUID]

                if mongo_plate[FIELD_MONGO_LAB_ID] != lab_id_field.value:
                    return ExportResult(
                        success=False,
                        create_plate_errors=[
                            CreatePlateError(
                                type=ErrorType.ExportingPlateAlreadyExists,
                                origin=RABBITMQ_CREATE_FEEDBACK_ORIGIN_PLATE,
                                description=
                                (f"Plate barcode '{plate_barcode}' already exists "
                                 f"with a different lab ID: '{mongo_plate[FIELD_MONGO_LAB_ID]}'"
                                 ),
                                field=lab_id_field.name,
                            )
                        ],
                    )

                return ExportResult(success=True, create_plate_errors=[])

            # Create a new plate for this message.
            mongo_plate = create_source_plate_doc(plate_barcode,
                                                  lab_id_field.value)
            source_plates_collection.insert_one(mongo_plate, session=session)
            self._plate_uuid = mongo_plate[FIELD_LH_SOURCE_PLATE_UUID]

            return ExportResult(success=True, create_plate_errors=[])
        except Exception as ex:
            LOGGER.critical(
                f"Error accessing MongoDB during export of source plate '{plate_barcode}': {ex}"
            )
            LOGGER.exception(ex)

            raise TransientRabbitError(
                f"There was an error updating MongoDB while exporting plate with barcode '{plate_barcode}'."
            )
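
# Sketch of how a session-scoped export step like the one above can be driven.
# The helper below is an assumption (not this page's API); pymongo's
# start_session() and start_transaction() context managers are the real calls.
def run_export_step_in_transaction(client, step):
    with client.start_session() as session:
        with session.start_transaction():
            step(session)  # e.g. the _record_source_plate_in_mongo_db method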
Example #8
def run(sftp: bool, keep_files: bool, add_to_dart: bool, settings_module: str = "", centre_prefix: str = "") -> None:
    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        # get or create the centres collection and filter down to only those with an SFTP data source
        centres = get_centres_config(config, CENTRE_DATA_SOURCE_SFTP)

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)
            ensure_mongo_collections_indexed(db)

            if centre_prefix:
                # We are only interested in processing a single centre
                centres = list(filter(lambda centre: centre.get(CENTRE_KEY_PREFIX) == centre_prefix, centres))
            else:
                # We should only include centres that are to be batch processed
                centres = list(filter(lambda centre: centre.get(CENTRE_KEY_INCLUDE_IN_SCHEDULED_RUNS, True), centres))

            centres_instances = [Centre(config, centre_config) for centre_config in centres]

            for centre_instance in centres_instances:
                logger.info("*" * 80)
                logger.info(f"Processing {centre_instance.centre_config[CENTRE_KEY_NAME]}")

                try:
                    if sftp:
                        centre_instance.download_csv_files()

                    centre_instance.process_files(add_to_dart)
                except Exception as e:
                    logger.error(f"Error in centre '{centre_instance.centre_config[CENTRE_KEY_NAME]}'")
                    logger.exception(e)
                finally:
                    if not keep_files and centre_instance.is_download_dir_walkable:
                        centre_instance.clean_up()

                # Prioritisation of samples
                update_priority_samples(db, config, add_to_dart)

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
Example #9
def update_mongo(config: Config, updated_at: datetime) -> None:
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)

        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        counter = 0
        for mysql_sample in mysql_sample_generator(
                config=config,
                query=f"SELECT * FROM lighthouse_sample WHERE updated_at > '{updated_at.strftime('%Y-%m-%d %H:%M')}'",
        ):
            mlwh_sample_uuid = mysql_sample.get(MLWH_LH_SAMPLE_UUID)

            if mlwh_sample_uuid is None:
                continue

            mongo_sample = samples_collection.find_one_and_update(
                filter={
                    FIELD_MONGODB_ID:
                    ObjectId(mysql_sample.get(MLWH_MONGODB_ID)),
                    FIELD_LH_SAMPLE_UUID: {
                        "$ne": mlwh_sample_uuid,
                    },
                },
                update={
                    "$set": {
                        FIELD_LH_SAMPLE_UUID: mlwh_sample_uuid,
                        UUID_UPDATED: True,
                        FIELD_UPDATED_AT: datetime.utcnow(),
                    }
                },
            )

            if mongo_sample is not None:
                counter += 1

            if counter > 0 and (counter % 5000) == 0:
                logger.debug(f"{counter = }")

        logger.debug(f"{counter} samples updated in mongo")
def mongo_samples_by_date(config: Config, start_datetime: datetime,
                          end_datetime: datetime) -> List[SampleDoc]:
    """Gets all samples from Mongo created before Crawler started setting filtered positive fields

    Arguments:
        config {Config} -- application config specifying database details
        start_datetime {datetime} -- lower limit of sample creation date
        end_datetime {datetime} -- upper limit of sample creation date

    Returns:
        List[SampleDoc] -- List of Mongo samples created before filtered positive Crawler changes
    """
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)
        return list(
            samples_collection.find({
                FIELD_CREATED_AT: {
                    "$gte": start_datetime,
                    "$lt": end_datetime
                },
            }))
Example #11
def get_centres_config(config: Config, data_source: str = "") -> List[CentreConf]:
    """Get the centres config from MongoDB. If MongoDB does not contain any centres config, it will become populated
    with the values in the app config for centres.

    Arguments:
        config {Config}: The configuration object for the whole application.
        data_source {str}: The data source filter to apply to centre configs, or an empty string to apply no filter.

    Return:
        List[CentreConf]: A List of CentreConf from MongoDB matching the given data source.
    """
    with create_mongo_client(config) as client:
        db = get_mongo_db(config, client)

        centres_collection_exists = collection_exists(db, COLLECTION_CENTRES)
        centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)

        if not centres_collection_exists:
            # Populate the centres collection from the config values
            create_index(centres_collection, FIELD_CENTRE_NAME, unique=True)
            populate_mongo_collection(centres_collection, config.CENTRES, FIELD_CENTRE_NAME)  # type: ignore

        # Get the centres collection from MongoDB
        cursor = centres_collection.find()
        centres = list(map(lambda x: cast(CentreConf, x), cursor))

        if data_source:

            def test_data_source(centre):
                try:
                    return centre.get(CENTRE_KEY_DATA_SOURCE).lower() == data_source.lower()
                except AttributeError:
                    return False

            centres = list(filter(test_data_source, centres))

        return centres
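
# Usage sketch: the run() example earlier on this page passes
# CENTRE_DATA_SOURCE_SFTP to restrict the result to SFTP-sourced centres:
#
#     centres = get_centres_config(config, CENTRE_DATA_SOURCE_SFTP)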
def migrate_all_dbs(config: Config,
                    s_start_datetime: str = "",
                    s_end_datetime: str = "") -> None:
    if not config:
        logger.error("Aborting run: Config required")
        return

    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    logger.info(
        f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []

        # open connection to mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            # 1. get samples from mongo between these time ranges
            samples = get_samples(samples_collection, start_datetime,
                                  end_datetime)

            if not samples:
                logger.info("No samples in this time range.")
                return

            logger.debug(f"{len(samples)} samples to process")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

            # 2. of these, find which have been cherry-picked and remove them from the list
            cp_samples_df = get_cherrypicked_samples(config,
                                                     list(root_sample_ids),
                                                     list(plate_barcodes))

            if cp_samples_df is None:  # we need to check if it is None explicitly
                raise Exception(
                    "Unable to determine cherry-picked samples - potentially an error connecting to MySQL"
                )

            # get the samples between those dates minus the cherry-picked ones
            if not cp_samples_df.empty:
                # we need a list of cherry-picked samples with their respective plate barcodes
                cp_samples = cp_samples_df[[
                    FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE
                ]].to_numpy().tolist()

                logger.debug(
                    f"{len(cp_samples)} cherry-picked samples in this timeframe"
                )

                samples = remove_cherrypicked_samples(samples, cp_samples)
            else:
                logger.debug("No cherry-picked samples in this timeframe")

            logger.info(
                f"{len(samples)} samples between these timestamps and not cherry-picked"
            )

            # 3. add the UUID fields if not present
            add_sample_uuid_field(samples)

            # update the samples with source plate UUIDs
            samples_updated_with_source_plate_uuids(mongo_db, samples)

            # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples
            #       from both steps)
            logger.info("Updating Mongo...")
            _ = update_mongo_fields(mongo_db, samples)
            logger.info("Finished updating Mongo")

        # convert mongo field values into MySQL format
        for sample in samples:
            mongo_docs_for_sql.append(
                map_mongo_sample_to_mysql(sample, copy_date=True))

        if (num_sql_docs := len(mongo_docs_for_sql)) > 0:
            logger.info(
                f"Updating MLWH database for {num_sql_docs} sample documents")
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:
                # 5. update the MLWH (should be an idempotent operation)
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mongo_docs_for_sql)

            # 6. add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any
            #       positive samples in these plates
            update_dart_fields(config, samples)
        else:
            logger.info(
                "No documents found for this timestamp range, nothing to insert or update in MLWH"
            )
    except Exception as e:
        logger.error("Error while attempting to migrate all DBs")
        logger.exception(e)
    def _record_samples_in_mongo_db(self,
                                    session: ClientSession) -> ExportResult:
        message_uuid = self._message.message_uuid.value
        LOGGER.debug(
            f"Attempting to insert {self._message.total_samples} "
            f"samples from message with UUID {message_uuid} into mongo...")

        try:
            try:
                session_database = get_mongo_db(self._config, session.client)
                samples_collection = get_mongo_collection(
                    session_database, COLLECTION_SAMPLES)
                result = samples_collection.insert_many(
                    documents=self._mongo_sample_docs,
                    ordered=False,
                    session=session)
            except BulkWriteError as ex:
                LOGGER.warning(
                    "BulkWriteError: will now establish whether this was because of duplicate samples."
                )

                duplication_errors = list(
                    filter(lambda x: x["code"] == 11000,
                           ex.details["writeErrors"])  # type: ignore
                )

                if len(duplication_errors) == 0:
                    # There weren't any duplication errors so this is not a problem with the message contents!
                    raise

                create_plate_errors = []
                for duplicate in [x["op"] for x in duplication_errors]:
                    create_plate_errors.append(
                        CreatePlateError(
                            type=ErrorType.ExportingSampleAlreadyExists,
                            origin=RABBITMQ_CREATE_FEEDBACK_ORIGIN_SAMPLE,
                            description=
                            (f"Sample with UUID '{duplicate[FIELD_LH_SAMPLE_UUID]}' was unable to be inserted "
                             "because another sample already exists with "
                             f"Lab ID = '{duplicate[FIELD_MONGO_LAB_ID]}'; "
                             f"Root Sample ID = '{duplicate[FIELD_MONGO_ROOT_SAMPLE_ID]}'; "
                             f"RNA ID = '{duplicate[FIELD_MONGO_RNA_ID]}'; "
                             f"Result = '{duplicate[FIELD_MONGO_RESULT]}'"),
                            sample_uuid=duplicate[FIELD_LH_SAMPLE_UUID],
                        ))

                return ExportResult(success=False,
                                    create_plate_errors=create_plate_errors)
        except Exception as ex:
            LOGGER.critical(
                f"Error accessing MongoDB during export of samples for message UUID '{message_uuid}': {ex}"
            )
            LOGGER.exception(ex)

            raise TransientRabbitError(
                f"There was an error updating MongoDB while exporting samples for message UUID '{message_uuid}'."
            )

        self._samples_inserted = len(result.inserted_ids)
        LOGGER.info(f"{self._samples_inserted} samples inserted into mongo.")

        return ExportResult(success=True, create_plate_errors=[])
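
# The duplicate-handling pattern above in isolation: insert_many(ordered=False)
# attempts every document, and MongoDB error code 11000 marks duplicate-key
# failures. A minimal sketch (the helper name is an assumption):
def insert_ignoring_duplicates(collection, documents) -> int:
    try:
        result = collection.insert_many(documents, ordered=False)
        return len(result.inserted_ids)
    except BulkWriteError as ex:
        other_errors = [e for e in ex.details["writeErrors"] if e["code"] != 11000]
        if other_errors:
            raise  # not just duplicates; let the caller handle it
        return ex.details["nInserted"]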
Example #14
def run(sftp: bool,
        keep_files: bool,
        add_to_dart: bool,
        settings_module: str = "") -> None:
    try:
        start = time.time()
        config, settings_module = get_config(settings_module)

        logging.config.dictConfig(config.LOGGING)

        logger.info("-" * 80)
        logger.info("START")
        logger.info(f"Using settings from {settings_module}")

        centres = config.CENTRES

        with create_mongo_client(config) as client:
            db = get_mongo_db(config, client)

            # get or create the centres collection
            centres_collection = get_mongo_collection(db, COLLECTION_CENTRES)

            logger.debug(
                f"Creating index '{FIELD_CENTRE_NAME}' on '{centres_collection.full_name}'"
            )
            centres_collection.create_index(FIELD_CENTRE_NAME, unique=True)
            populate_collection(centres_collection, centres, FIELD_CENTRE_NAME)

            # get or create the source plates collection
            source_plates_collection = get_mongo_collection(
                db, COLLECTION_SOURCE_PLATES)

            logger.debug(
                f"Creating index '{FIELD_BARCODE}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_BARCODE, unique=True)

            logger.debug(
                f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{source_plates_collection.full_name}'"
            )
            source_plates_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID,
                                                  unique=True)

            with samples_collection_accessor(
                    db, COLLECTION_SAMPLES) as samples_collection:
                # Index on plate barcode to make it easier to select based on plate barcode
                logger.debug(
                    f"Creating index '{FIELD_PLATE_BARCODE}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_PLATE_BARCODE)

                # Index on result column to make it easier to select the positives
                logger.debug(
                    f"Creating index '{FIELD_RESULT}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_RESULT)

                # Index on unique combination of columns
                logger.debug(
                    f"Creating compound index on '{samples_collection.full_name}'"
                )
                # create compound index on 'Root Sample ID', 'RNA ID', 'Result', 'Lab ID' - some
                # data had the same plate tested at another time so ignore the data if it is exactly
                # the same
                samples_collection.create_index(
                    [
                        (FIELD_ROOT_SAMPLE_ID, pymongo.ASCENDING),
                        (FIELD_RNA_ID, pymongo.ASCENDING),
                        (FIELD_RESULT, pymongo.ASCENDING),
                        (FIELD_LAB_ID, pymongo.ASCENDING),
                    ],
                    unique=True,
                )

                # Index on lh_source_plate_uuid column
                # Added to make lighthouse API source completion event call query more efficient
                logger.debug(
                    f"Creating index '{FIELD_LH_SOURCE_PLATE_UUID}' on '{samples_collection.full_name}'"
                )
                samples_collection.create_index(FIELD_LH_SOURCE_PLATE_UUID)

                centres_instances = [
                    Centre(config, centre_config) for centre_config in centres
                ]
                for centre_instance in centres_instances:
                    logger.info("*" * 80)
                    logger.info(
                        f"Processing {centre_instance.centre_config['name']}")

                    try:
                        if sftp:
                            centre_instance.download_csv_files()

                        centre_instance.process_files(add_to_dart)
                    except Exception as e:
                        logger.error("An exception occured")
                        logger.error(
                            f"Error in centre {centre_instance.centre_config['name']}"
                        )
                        logger.exception(e)
                    finally:
                        if not keep_files and centre_instance.is_download_dir_walkable:
                            centre_instance.clean_up()

        logger.info(f"Import complete in {round(time.time() - start, 2)}s")
        logger.info("=" * 80)
    except Exception as e:
        logger.exception(e)
def update_mlwh_with_legacy_samples(config: Config,
                                    s_start_datetime: str = "",
                                    s_end_datetime: str = "") -> None:
    if not valid_datetime_string(s_start_datetime):
        print("Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        print("Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        print("Aborting run: End datetime must be greater than Start datetime")
        return

    print(
        f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []
        number_docs_found = 0

        # open connection to mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            print("Selecting Mongo samples")

            # this loads everything from the cursor find into memory (assuming
            # you have enough memory)
            mongo_docs = list(
                samples_collection.find({
                    FIELD_CREATED_AT: {
                        "$gte": start_datetime,
                        "$lte": end_datetime
                    }
                }))
            number_docs_found = len(mongo_docs)
            print(
                f"{number_docs_found} documents found in the mongo database between these timestamps"
            )

            # convert mongo field values into MySQL format
            for doc in mongo_docs:
                mongo_docs_for_sql.append(
                    map_mongo_sample_to_mysql(doc, copy_date=True))

        if number_docs_found > 0:
            print(
                f"Updating MLWH database for {len(mongo_docs_for_sql)} sample documents"
            )
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:

                # execute sql query to insert/update timestamps into MLWH
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mongo_docs_for_sql)
        else:
            print(
                "No documents found for this timestamp range, nothing to insert or update in MLWH"
            )

    except Exception as e:
        print_exception(type(e), e, e.__traceback__)
Example #16
def test_get_mongo_db(mongo_client):
    config, mongo_client = mongo_client

    assert isinstance(get_mongo_db(config, mongo_client), Database)
Example #17
def setup_mongo_indexes(config):
    with create_mongo_client(config) as client:
        db = get_mongo_db(config, client)
        ensure_mongo_collections_indexed(db)
def update_mongo_filtered_positive_fields(config: Config,
                                          samples: List[SampleDoc],
                                          version: str,
                                          update_timestamp: datetime) -> bool:
    """Batch updates sample filtered positive fields in the Mongo database

    Arguments:
        config {Config} -- application config specifying database details
        samples {List[Sample]} -- the list of samples whose filtered positive fields should be updated
        version {str} -- the filtered positive identifier version used
        update_timestamp {datetime} -- the timestamp at which the update was performed

    Returns:
        bool -- whether the updates completed successfully
    """
    with create_mongo_client(config) as client:
        mongo_db = get_mongo_db(config, client)
        samples_collection = get_mongo_collection(mongo_db, COLLECTION_SAMPLES)

        num_samples = len(samples)
        SAMPLES_PER_QUERY = 15000
        samples_index = 0
        logger.debug(
            f"Attempting to update {num_samples} rows in Mongo in batches of {SAMPLES_PER_QUERY}"
        )
        while samples_index < num_samples:
            logger.debug(
                f"Updating records between {samples_index} and {samples_index + SAMPLES_PER_QUERY}"
            )

            samples_batch = samples[samples_index:(
                samples_index + SAMPLES_PER_QUERY)]  # noqa: E203

            # get ids of those that are filtered positive, and those that aren't
            filtered_positive_ids = []
            filtered_negative_ids = []
            for sample in samples_batch:
                if sample[FIELD_FILTERED_POSITIVE] is True:
                    filtered_positive_ids.append(sample[FIELD_MONGODB_ID])
                else:
                    filtered_negative_ids.append(sample[FIELD_MONGODB_ID])

            samples_collection.update_many(
                {FIELD_MONGODB_ID: {
                    "$in": filtered_positive_ids
                }},
                {
                    "$set": {
                        FIELD_FILTERED_POSITIVE: True,
                        FIELD_FILTERED_POSITIVE_VERSION: version,
                        FIELD_FILTERED_POSITIVE_TIMESTAMP: update_timestamp,
                    }
                },
            )

            samples_collection.update_many(
                {FIELD_MONGODB_ID: {
                    "$in": filtered_negative_ids
                }},
                {
                    "$set": {
                        FIELD_FILTERED_POSITIVE: False,
                        FIELD_FILTERED_POSITIVE_VERSION: version,
                        FIELD_FILTERED_POSITIVE_TIMESTAMP: update_timestamp,
                    }
                },
            )

            samples_index += SAMPLES_PER_QUERY
        return True
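
# The batching arithmetic above, extracted as a generic helper (a sketch, not
# this module's API):
def batches(items: List[SampleDoc], size: int):
    """Yield successive slices of items, each at most size elements long."""
    for start in range(0, len(items), size):
        yield items[start:start + size]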
Example #19
def run(settings_module: str = "") -> None:
    config, settings_module = get_config(settings_module)

    with create_mongo_client(config) as client:
        db = get_mongo_db(config, client)
        sample_timestamps_helper.add_timestamps_to_samples(db)
    def _mongo_db(self) -> Database:
        # hasattr(self, "__mongo_db") was always False here: name mangling
        # stores the attribute as _<ClassName>__mongo_db, so the cached value
        # was never found and a new client was created on every access.
        # Attribute access from inside the class is mangled consistently.
        try:
            return self.__mongo_db
        except AttributeError:
            client = create_mongo_client(self._config)
            self.__mongo_db = get_mongo_db(self._config, client)
            return self.__mongo_db
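
    # Alternative sketch: functools.cached_property (Python 3.8+) gives the
    # same lazy, once-only initialisation without manual attribute caching:
    #
    #     from functools import cached_property
    #
    #     @cached_property
    #     def _mongo_db(self) -> Database:
    #         return get_mongo_db(self._config, create_mongo_client(self._config))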