Example #1
0
def validate_args(config: Config, s_start_datetime: str, s_end_datetime: str,
                  updated_at: str) -> Tuple[datetime, datetime, datetime]:
    """Check the run arguments and convert the datetime strings to datetimes.

    Every failure is logged as an error and then raised as an Exception
    carrying the same message.

    Arguments:
        config {Config} -- application config; must be truthy
        s_start_datetime {str} -- start of the range, parsed with MONGO_DATETIME_FORMAT
        s_end_datetime {str} -- end of the range, parsed with MONGO_DATETIME_FORMAT
        updated_at {str} -- updated-at timestamp, parsed with MONGO_DATETIME_FORMAT

    Returns:
        Tuple[datetime, datetime, datetime] -- start, end and updated-at datetimes

    Raises:
        Exception -- if the config is missing, any string fails valid_datetime_string,
            or the start is later than the end
    """
    prefix = "Aborting run: "

    def abort(reason: str) -> None:
        # Single exit point for every validation failure: log, then raise.
        message = f"{prefix} {reason}"
        logger.error(message)
        raise Exception(message)

    if not config:
        abort("Config required")

    labelled_inputs = (
        ("Start", s_start_datetime),
        ("End", s_end_datetime),
        ("updated_at", updated_at),
    )

    # All three strings are validated before any of them is parsed.
    for label, raw_value in labelled_inputs:
        if not valid_datetime_string(raw_value):
            abort(f"Expected format of {label} datetime is YYMMDD_HHmm")

    start_datetime, end_datetime, updated_at_datetime = (
        datetime.strptime(raw_value, MONGO_DATETIME_FORMAT)
        for _, raw_value in labelled_inputs)

    # An equal start and end is accepted; only start strictly after end aborts.
    if start_datetime > end_datetime:
        abort("End datetime must be greater than Start datetime")

    return start_datetime, end_datetime, updated_at_datetime
def run(settings_module: str = "",
        s_start_datetime: str = "",
        s_end_datetime: str = "") -> None:
    """Migrate the existing samples to have the filtered positive values.

    Samples are selected from Mongo by date range, split by filtered positive
    version, and then written back to Mongo and to MLWH one version at a time.
    MLWH is only attempted for a version once its Mongo update has succeeded.
    A per-version status report is logged whether or not the run succeeds.

    Arguments:
        settings_module {str} -- settings module from which to generate the app config
        s_start_datetime {str} -- start of the date range, parsed with MONGO_DATETIME_FORMAT
        s_end_datetime {str} -- end of the date range, parsed with MONGO_DATETIME_FORMAT
    """
    for label, raw_value in (("Start", s_start_datetime),
                             ("End", s_end_datetime)):
        if not valid_datetime_string(raw_value):
            logger.error(
                f"Aborting run: Expected format of {label} datetime is YYMMDD_HHmm"
            )
            return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)
    fields_set_datetime = datetime.strptime(FILTERED_POSITIVE_FIELDS_SET_DATE,
                                            "%Y-%m-%d")

    range_problem = None
    if start_datetime > end_datetime:
        range_problem = "End datetime must be greater than Start datetime"
    elif end_datetime > fields_set_datetime:
        range_problem = "Date range must be prior to the 17th December"

    if range_problem is not None:
        logger.error(f"Aborting run: {range_problem}")
        return

    config, settings_module = get_config(settings_module)

    logging.config.dictConfig(config.LOGGING)

    logger.info("-" * 80)
    logger.info("STARTING FILTERED POSITIVES LEGACY UPDATE")
    logger.info(f"Time start: {datetime.now()}")
    start_time = time.time()

    updated_key = "Updated"
    time_key = "Time taken"

    # Short labels used in the final report, paired with the version they describe.
    labelled_versions = (
        ("v0", FILTERED_POSITIVE_VERSION_0),
        ("v1", FILTERED_POSITIVE_VERSION_1),
        ("v2", FILTERED_POSITIVE_VERSION_2),
    )

    # One independent status record per version and per database.
    mongo_versions_updated = {
        version: {
            updated_key: False,
            time_key: 0.0
        }
        for _, version in labelled_versions
    }
    mlwh_versions_updated = {
        version: {
            updated_key: False,
            time_key: 0.0
        }
        for _, version in labelled_versions
    }

    try:
        proceed = pre_migration_filtered_positive_check(
            config, start_datetime, end_datetime)

        if not proceed:
            logger.info("Now exiting migration")
        else:
            logger.info(
                f"Selecting legacy samples from Mongo between {start_datetime} and {end_datetime}..."
            )
            legacy_samples = mongo_samples_by_date(config, start_datetime,
                                                   end_datetime)
            logger.info(f"{len(legacy_samples)} samples found from Mongo")

            root_sample_ids, plate_barcodes = extract_required_cp_info(
                legacy_samples)

            # Cherrypicked samples up to the v0/v1 cutoff
            logger.info("Querying for v0 cherrypicked samples from MLWH")
            v0_cherrypicked_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                "1970-01-01 00:00:01",
                V0_V1_CUTOFF_TIMESTAMP,
            )
            logger.debug(
                f"Found {len(v0_cherrypicked_df.index)} v0 cherrypicked samples"
            )  # type: ignore

            # Cherrypicked samples between the v0/v1 and v1/v2 cutoffs
            logger.info("Querying for cherrypicked samples from MLWH")
            v1_cherrypicked_df = get_cherrypicked_samples_by_date(
                config,
                list(root_sample_ids),
                list(plate_barcodes),
                V0_V1_CUTOFF_TIMESTAMP,
                V1_V2_CUTOFF_TIMESTAMP,
            )
            logger.debug(
                f"Found {len(v1_cherrypicked_df.index)} v1 cherrypicked samples"
            )  # type: ignore

            logger.info("Splitting samples by version...")
            samples_by_version = split_mongo_samples_by_version(
                legacy_samples, v0_cherrypicked_df, v1_cherrypicked_df)

            # The same timestamp is passed to every update call in this run.
            update_timestamp = datetime.now()

            for version, version_samples in samples_by_version.items():
                identifier = filtered_positive_identifier_by_version(version)
                logger.info(f"Updating {version} filtered positives...")
                update_filtered_positive_fields(
                    identifier,
                    version_samples,
                    version,
                    update_timestamp,
                )

            logger.info("Updated filtered positives")

            logger.info("Updating Mongo")

            for version, version_samples in samples_by_version.items():
                logger.info(
                    f"Updating {version} filtered positives in Mongo, total {len(version_samples)} records..."
                )
                mongo_started = time.time()
                mongo_ok = update_mongo_filtered_positive_fields(
                    config,
                    version_samples,
                    version,
                    update_timestamp,
                )
                if not mongo_ok:
                    # MLWH is not attempted for a version whose Mongo update failed.
                    continue

                logger.info(
                    f"Finished updating {version} filtered positives in Mongo")
                mongo_finished = time.time()
                mongo_status = mongo_versions_updated[version]
                mongo_status[updated_key] = True
                mongo_status[time_key] = round(mongo_finished - mongo_started,
                                               2)

                logger.info(
                    f"Updating {version} filtered positives in MLWH...")
                mlwh_started = time.time()
                mlwh_ok = update_mlwh_filtered_positive_fields_batched(
                    config, version_samples, version, update_timestamp)
                if not mlwh_ok:
                    continue

                logger.info(
                    f"Finished updating {version} filtered positives in MLWH")
                mlwh_finished = time.time()
                mlwh_status = mlwh_versions_updated[version]
                mlwh_status[updated_key] = True
                mlwh_status[time_key] = round(mlwh_finished - mlwh_started, 2)

            logger.info("Finished updating databases")
    except Exception as e:
        logger.error("---------- Process aborted: ----------")
        logger.error(f"An exception occurred, at {datetime.now()}")
        logger.exception(e)
        raise
    finally:
        end_time = time.time()

        # Build the multi-line status report: a leading blank line, a header,
        # one line per database/version pair, and a trailing indent-only line.
        indent = " " * 8
        report = [
            "",
            f"{indent}---------- Processing status of filtered positive field migration: ----------",
        ]
        for db_name, statuses in (("Mongo", mongo_versions_updated),
                                  ("MLWH", mlwh_versions_updated)):
            for label, version in labelled_versions:
                report.append(
                    f"{indent}-- {db_name} updated with {label} filtered positives: "
                    f"{statuses[version][updated_key]}, "
                    f"time taken: {statuses[version][time_key]}s")
        report.append(indent)
        logger.info("\n".join(report))

    # Only reached when no exception was raised above.
    logger.info(f"Time finished: {datetime.now()}")
    logger.info(f"Migration complete in {round(end_time - start_time, 2)}s")
    logger.info("=" * 80)
def migrate_all_dbs(config: Config,
                    s_start_datetime: str = "",
                    s_end_datetime: str = "") -> None:
    """Update Mongo, MLWH and DART for non-cherry-picked samples in a date range.

    Validation failures (missing config, malformed datetime strings, start
    later than end) are logged as errors and the function returns without
    doing any work.

    NOTE(review): this definition is truncated in the visible source - the
    final `else:` branch and the handler for the `try` block are not shown.

    Arguments:
        config {Config} -- application config; must be truthy
        s_start_datetime {str} -- start of the range, parsed with MONGO_DATETIME_FORMAT
        s_end_datetime {str} -- end of the range, parsed with MONGO_DATETIME_FORMAT
    """
    if not config:
        logger.error("Aborting run: Config required")
        return

    if not valid_datetime_string(s_start_datetime):
        logger.error(
            "Aborting run: Expected format of Start datetime is YYMMDD_HHmm")
        return

    if not valid_datetime_string(s_end_datetime):
        logger.error(
            "Aborting run: Expected format of End datetime is YYMMDD_HHmm")
        return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    # An equal start and end is accepted; only start strictly after end aborts.
    if start_datetime > end_datetime:
        logger.error(
            "Aborting run: End datetime must be greater than Start datetime")
        return

    logger.info(
        f"Starting DART update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []

        # open connection to mongo
        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)

            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            # 1. get samples from mongo between these time ranges
            samples = get_samples(samples_collection, start_datetime,
                                  end_datetime)

            if not samples:
                logger.info("No samples in this time range.")
                return

            logger.debug(f"{len(samples)} samples to process")

            root_sample_ids, plate_barcodes = extract_required_cp_info(samples)

            logger.debug(f"{len(plate_barcodes)} unique plate barcodes")

            # 2. of these, find which have been cherry-picked and remove them from the list
            cp_samples_df = get_cherrypicked_samples(config,
                                                     list(root_sample_ids),
                                                     list(plate_barcodes))

            if cp_samples_df is None:  # we need to check if it is None explicitly
                raise Exception(
                    "Unable to determine cherry-picked sample - potentially error connecting to MySQL"
                )

            # get the samples between those dates minus the cherry-picked ones
            # (the `is not None` test below is redundant after the raise above)
            if cp_samples_df is not None and not cp_samples_df.empty:
                # we need a list of cherry-picked samples with their respective plate barcodes
                cp_samples = cp_samples_df[[
                    FIELD_ROOT_SAMPLE_ID, FIELD_PLATE_BARCODE
                ]].to_numpy().tolist()

                logger.debug(
                    f"{len(cp_samples)} cherry-picked samples in this timeframe"
                )

                samples = remove_cherrypicked_samples(samples, cp_samples)
            else:
                logger.debug("No cherry-picked samples in this timeframe")

            logger.info(
                f"{len(samples)} samples between these timestamps and not cherry-picked"
            )

            # 3. add the UUID fields if not present
            add_sample_uuid_field(samples)

            # update the samples with source plate UUIDs
            samples_updated_with_source_plate_uuids(mongo_db, samples)

            # 4. update samples in mongo updated in either of the above two steps (would expect the same set of samples
            #       from both steps)
            logger.info("Updating Mongo...")
            _ = update_mongo_fields(mongo_db, samples)
            logger.info("Finished updating Mongo")

        # `samples` was bound inside the `with` block above and is still in scope here
        # convert mongo field values into MySQL format
        for sample in samples:
            mongo_docs_for_sql.append(
                map_mongo_sample_to_mysql(sample, copy_date=True))

        if (num_sql_docs := len(mongo_docs_for_sql)) > 0:
            logger.info(
                f"Updating MLWH database for {num_sql_docs} sample documents")
            # create connection to the MLWH database
            with create_mysql_connection(config, False) as mlwh_conn:
                # 5. update the MLWH (should be an idempotent operation)
                run_mysql_executemany_query(mlwh_conn,
                                            SQL_MLWH_MULTIPLE_INSERT,
                                            mongo_docs_for_sql)

            # 6. add all the plates with non-cherrypicked samples (determined in step 2) to DART, as well as any
            #       positive samples in these plates
            update_dart_fields(config, samples)
        else:
            # NOTE(review): the body of this branch and the rest of the function are missing from the visible source
def update_mlwh_with_legacy_samples(config: Config,
                                    s_start_datetime: str = "",
                                    s_end_datetime: str = "") -> None:
    """Copy Mongo samples created within a date range into the MLWH database.

    Progress and validation failures are reported with print. Any exception
    raised while talking to either database is passed to print_exception
    rather than propagated.

    Arguments:
        config {Config} -- application config used to open the Mongo and MySQL connections
        s_start_datetime {str} -- start of the range, parsed with MONGO_DATETIME_FORMAT
        s_end_datetime {str} -- end of the range, parsed with MONGO_DATETIME_FORMAT
    """
    for label, raw_value in (("Start", s_start_datetime),
                             ("End", s_end_datetime)):
        if not valid_datetime_string(raw_value):
            print(
                f"Aborting run: Expected format of {label} datetime is YYMMDD_HHmm"
            )
            return

    start_datetime = datetime.strptime(s_start_datetime, MONGO_DATETIME_FORMAT)
    end_datetime = datetime.strptime(s_end_datetime, MONGO_DATETIME_FORMAT)

    if start_datetime > end_datetime:
        print("Aborting run: End datetime must be greater than Start datetime")
        return

    print(
        f"Starting MLWH update process with Start datetime {start_datetime} and End datetime {end_datetime}"
    )

    try:
        mongo_docs_for_sql = []
        number_docs_found = 0

        # Inclusive range filter on the creation timestamp.
        created_in_range = {
            FIELD_CREATED_AT: {
                "$gte": start_datetime,
                "$lte": end_datetime
            }
        }

        with create_mongo_client(config) as client:
            mongo_db = get_mongo_db(config, client)
            samples_collection = get_mongo_collection(mongo_db,
                                                      COLLECTION_SAMPLES)

            print("Selecting Mongo samples")

            # Materialise the whole cursor in memory before converting anything.
            mongo_docs = list(samples_collection.find(created_in_range))
            number_docs_found = len(mongo_docs)
            print(
                f"{number_docs_found} documents found in the mongo database between these timestamps"
            )

            # Translate each Mongo document into the MySQL row format.
            mongo_docs_for_sql.extend(
                map_mongo_sample_to_mysql(doc, copy_date=True)
                for doc in mongo_docs)

        if not number_docs_found:
            print(
                "No documents found for this timestamp range, nothing to insert or update in MLWH"
            )
            return

        print(
            f"Updating MLWH database for {len(mongo_docs_for_sql)} sample documents"
        )
        with create_mysql_connection(config, False) as mlwh_conn:
            # Insert/update the converted rows in MLWH in one batch.
            run_mysql_executemany_query(mlwh_conn, SQL_MLWH_MULTIPLE_INSERT,
                                        mongo_docs_for_sql)

    except Exception:
        print_exception()
Example #5
0
def test_valid_datetime_string():
    """An empty string is rejected; a YYMMDD_HHmm-shaped string is accepted."""
    assert valid_datetime_string("") is False
    assert valid_datetime_string("201209_0000") is True