Exemple #1
0
def _derive_files_from_upload(trial_id: str, upload_type: str, session):
    """
    Derive files for an upload and persist each derived artifact.

    Looks up the trial's metadata record, runs `unprism.derive_files` for
    the given upload type, then for every resulting artifact uploads its
    data to the data bucket and registers it through
    `DownloadableFiles.create_from_blob`. Finally the trial's metadata JSON
    is replaced with the one returned by the derivation and the session is
    committed.

    Args:
        trial_id: identifier of the trial whose metadata drives the derivation.
        upload_type: upload type handed to the derivation context.
        session: database session used for lookups, record creation and
            the final commit.
    """
    # Fetch the metadata record for the trial this upload belongs to
    trial: TrialMetadata = TrialMetadata.find_by_trial_id(trial_id, session=session)

    # Perform the derivation itself
    context = unprism.DeriveFilesContext(
        trial.metadata_json, upload_type, fetch_artifact
    )
    result = unprism.derive_files(context)

    # TODO: consider parallelizing this step if necessary
    for derived in result.artifacts:
        # Push the artifact's data to the data bucket
        gcs_blob = upload_to_data_bucket(derived.object_url, derived.data)

        # Minimal facet group: "<data format>|<file type>"
        facets = f"{derived.data_format}|{derived.file_type}"

        # Register the uploaded blob in the database
        file_record = DownloadableFiles.create_from_blob(
            trial.trial_id,
            derived.file_type,
            derived.data_format,
            facets,
            gcs_blob,
            session=session,
            alert_artifact_upload=True,
        )
        file_record.additional_metadata = derived.metadata
        # Derived files are assumed to be directly usable for data analysis
        file_record.analysis_friendly = True

    # The derivation may have modified the trial metadata, so store its version
    trial.metadata_json = result.trial_metadata

    session.commit()
def test_create_downloadable_file_from_blob(clean_db, monkeypatch):
    """
    Exercise DownloadableFiles.create_from_blob with a mocked GCS blob:
    initial creation, re-upload of the same blob name, and the optional
    artifact-upload alert.
    """
    # Stand-in for a GCS blob; `name` must be assigned after construction
    # because MagicMock(name=...) configures the mock's own repr name instead.
    blob = MagicMock()
    blob.name = "name"
    blob.md5_hash = "12345"
    blob.crc32c = "54321"
    blob.size = 5
    blob.time_created = datetime.now()

    # A trial record for the downloadable file to refer to
    trial = TrialMetadata(
        trial_id="id",
        metadata_json={
            "protocol_identifier": "id",
            "allowed_collection_event_names": [],
            "allowed_cohort_names": [],
            "participants": [],
        },
    )
    clean_db.add(trial)

    # Positional arguments shared by every create_from_blob call below:
    # trial id, upload type, data format, facet group
    file_args = ("id", "pbmc", "Shipping Manifest", "pbmc/shipping")

    created = DownloadableFiles.create_from_blob(*file_args, blob)

    # Replace artifact upload publishing with a mock
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    # Exactly one record exists and it mirrors the blob's attributes
    assert clean_db.query(DownloadableFiles).count() == 1
    found = DownloadableFiles.find_by_id(created.id)
    assert found.object_url == blob.name
    assert found.data_format == "Shipping Manifest"
    assert found.file_size_bytes == blob.size
    assert found.md5_hash == blob.md5_hash
    assert found.crc32c_hash == blob.crc32c

    # Upload the same blob name again with changed size/hash; this must
    # not produce a duplicate entry
    blob.size = 6
    blob.md5_hash = "6"
    created = DownloadableFiles.create_from_blob(*file_args, blob)

    # Still a single record, now carrying the new values
    assert clean_db.query(DownloadableFiles).count() == 1
    found = DownloadableFiles.find_by_id(created.id)
    assert found.file_size_bytes == 6
    assert found.md5_hash == "6"

    # No artifact upload event has been published so far
    publisher.assert_not_called()

    # With alerting requested, the blob's name is published exactly once
    DownloadableFiles.create_from_blob(*file_args, blob, alert_artifact_upload=True)
    publisher.assert_called_once_with(blob.name)
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.

    Args:
        event: Pub/Sub event whose data decodes to the integer upload job id.
        context: background-function context (unused here).

    Returns:
        A JSON response mapping each upload URL to its target URL. Google
        won't do anything with it; it is provided for testing purposes only.

    Raises:
        Exception: if no job exists for the id, if the job's status is not
            UPLOAD_COMPLETED, or if the job's metadata patch lacks a
            protocol identifier. Errors from the GCS copy, metadata merge
            and xlsx registration steps propagate out of their
            `saved_failure_status` blocks.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            # executor.map only re-raises a worker's exception when its result
            # is retrieved from the returned iterator. Materialize the results
            # here so that a failed copy is raised inside saved_failure_status
            # instead of later, when the results are first iterated.
            destination_objects = list(
                executor.map(
                    lambda url_bundle: _gcs_copy(
                        storage_client,
                        GOOGLE_UPLOAD_BUCKET,
                        url_bundle.upload_url,
                        GOOGLE_DATA_BUCKET,
                        url_bundle.target_url,
                    ),
                    url_bundles,
                )
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        # (lazy %-style args so the patch itself actually appears in the log)
        logger.info(
            "Merging metadata from upload %d into trial %s: %s",
            job.id,
            trial_id,
            metadata_patch,
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Making files downloadable by a specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # to whom give access to
                f"{trial_id}/{assay_prefix}",  # to what sub-folder
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        # NOTE(review): the map results are never iterated, so an exception
        # raised by publish_artifact_upload in a worker is not surfaced here.
        # This looks like deliberate best-effort publishing after the job has
        # already been marked successful - confirm before changing.
        logger.info("Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            # Block until the publish has completed (or raised)
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify({bundle.upload_url: bundle.target_url for bundle in url_bundles})