Example 1
def test_downloadable_files_data_category_prefix():
    """Check that data_category_prefix's are derived as expected"""
    file_w_category = DownloadableFiles(facet_group="/wes/r1_L.fastq.gz")
    assert file_w_category.data_category_prefix == "WES"

    file_no_category = DownloadableFiles()
    assert file_no_category.data_category_prefix is None
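
The assertions above only pin down two mappings: facet_group "/wes/r1_L.fastq.gz" yields "WES", and a missing facet_group yields None. A minimal sketch of a derivation consistent with that, assuming the prefix is taken from the first path segment of facet_group; the real DownloadableFiles property may instead go through a facet lookup, so this is purely illustrative:

def derive_data_category_prefix(facet_group):
    """Hypothetical helper: '/wes/r1_L.fastq.gz' -> 'WES', falsy input -> None."""
    if not facet_group:
        return None
    return facet_group.strip("/").split("/")[0].upper()

assert derive_data_category_prefix("/wes/r1_L.fastq.gz") == "WES"
assert derive_data_category_prefix(None) is None
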
def test_upload_manifest_twice(cidc_api, clean_db, monkeypatch):
    """Ensure that doing upload_manifest twice will produce only one DownloadableFiles"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    mocks = UploadMocks(monkeypatch)

    client = cidc_api.test_client()

    grant_upload_permission(user_id, "pbmc", cidc_api)
    make_nci_biobank_user(user_id, cidc_api)

    res = client.post(MANIFEST_UPLOAD,
                      data=form_data("pbmc.xlsx", io.BytesIO(b"a"), "pbmc"))
    assert res.status_code == 200

    # Check that we tried to publish a patient/sample update
    mocks.publish_patient_sample_update.assert_called_once()

    with cidc_api.app_context():
        assert not DownloadableFiles.list()  # manifest is not stored

    # uploading second time
    res = client.post(MANIFEST_UPLOAD,
                      data=form_data("pbmc.xlsx", io.BytesIO(b"b"), "pbmc"))
    assert res.status_code == 200

    assert mocks.upload_xlsx.call_count == 0  # manifest is not stored

    with cidc_api.app_context():
        assert not DownloadableFiles.list()  # manifest is not stored
Example 3
def test_downloadable_files_additional_metadata_default(clean_db):
    TrialMetadata.create(TRIAL_ID, METADATA)
    df = DownloadableFiles(
        trial_id=TRIAL_ID,
        upload_type="wes_bam",
        object_url="10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        file_size_bytes=1,
        md5_hash="hash1234",
        uploaded_timestamp=datetime.now(),
    )

    # Check no value passed
    df.insert()
    assert df.additional_metadata == {}

    for nullish_value in [None, "null", {}]:
        df.additional_metadata = nullish_value
        df.update()
        assert df.additional_metadata == {}

    # Non-nullish value doesn't get overridden
    non_nullish_value = {"foo": "bar"}
    df.additional_metadata = non_nullish_value
    df.update()
    assert df.additional_metadata == non_nullish_value
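
The test pins down one normalization rule: None, the string "null", and {} are all stored as {}, while a real dict is kept. A minimal sketch of that coercion as a standalone function (the actual models may implement it elsewhere, e.g. in a SQLAlchemy validator or inside update(); the function name here is hypothetical):

def coerce_additional_metadata(value):
    """Hypothetical: normalize nullish values (None, "null", {}) to an empty dict."""
    return {} if value in (None, "null", {}) else value

assert coerce_additional_metadata("null") == {}
assert coerce_additional_metadata({"foo": "bar"}) == {"foo": "bar"}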
Example 4
def test_create_downloadable_file_from_metadata(db, monkeypatch):
    """Try to create a downloadable file from artifact_core metadata"""
    # fake file metadata
    file_metadata = {
        "artifact_category": "Assay Artifact from CIMAC",
        "object_url": "10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        "file_name": "wes_forward.fastq",
        "file_size_bytes": 1,
        "md5_hash": "hash1234",
        "uploaded_timestamp": datetime.now(),
        "foo": "bar",  # unsupported column - should be filtered
    }

    # Create the trial (to avoid violating foreign-key constraint)
    TrialMetadata.patch_trial_metadata(TRIAL_ID, METADATA)
    # Create the file
    DownloadableFiles.create_from_metadata(TRIAL_ID, "wes", file_metadata)

    # Check that we created the file
    new_file = (db.query(DownloadableFiles).filter_by(
        file_name=file_metadata["file_name"]).first())
    assert new_file
    del file_metadata["foo"]
    for k in file_metadata.keys():
        assert getattr(new_file, k) == file_metadata[k]
Example 5
def test_create_downloadable_file_from_metadata(clean_db, monkeypatch):
    """Try to create a downloadable file from artifact_core metadata"""
    # fake file metadata
    file_metadata = {
        "object_url": "10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        "file_size_bytes": 1,
        "md5_hash": "hash1234",
        "uploaded_timestamp": datetime.now(),
        "foo": "bar",  # unsupported column - should be filtered
    }
    additional_metadata = {"more": "info"}

    # Mock artifact upload publishing
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    # Create the trial (to avoid violating foreign-key constraint)
    TrialMetadata.create(TRIAL_ID, METADATA)

    # Create files with empty or "null" additional metadata
    for nullish_value in ["null", None, {}]:
        df = DownloadableFiles.create_from_metadata(
            TRIAL_ID, "wes_bam", file_metadata, additional_metadata=nullish_value
        )
        clean_db.refresh(df)
        assert df.additional_metadata == {}

    # Create the file
    DownloadableFiles.create_from_metadata(
        TRIAL_ID, "wes_bam", file_metadata, additional_metadata=additional_metadata
    )

    # Check that we created the file
    new_file = (
        clean_db.query(DownloadableFiles)
        .filter_by(object_url=file_metadata["object_url"])
        .first()
    )
    assert new_file
    del file_metadata["foo"]
    for k in file_metadata.keys():
        assert getattr(new_file, k) == file_metadata[k]
    assert new_file.additional_metadata == additional_metadata

    # Check that no artifact upload event was published
    publisher.assert_not_called()

    # Check that artifact upload publishes
    DownloadableFiles.create_from_metadata(
        TRIAL_ID,
        "wes_bam",
        file_metadata,
        additional_metadata=additional_metadata,
        alert_artifact_upload=True,
    )
    publisher.assert_called_once_with(file_metadata["object_url"])
Example 6
def create_df(facet_group, additional_metadata={}) -> DownloadableFiles:
    df = DownloadableFiles(
        facet_group=facet_group,
        additional_metadata=additional_metadata,
        trial_id=TRIAL_ID,
        uploaded_timestamp=datetime.now(),
        file_size_bytes=0,
        object_url=facet_group,  # just filler, not relevant to the test
        upload_type="",
    )
    df.insert()
    clean_db.refresh(df)
    return df
Example 7
def test_info_data_overview(cidc_api, clean_db):
    """Check that the data overview has expected structure and values"""
    def insert_trial(trial_id, num_participants, num_samples):
        TrialMetadata(
            trial_id=trial_id,
            metadata_json={
                prism.PROTOCOL_ID_FIELD_NAME: trial_id,
                "allowed_cohort_names": [""],
                "allowed_collection_event_names": [""],
                "participants": [{
                    "cimac_participant_id":
                    f"CTTTPP{p}",
                    "participant_id":
                    "x",
                    "samples": [{
                        "cimac_id": f"CTTTPP1SS.0{s}",
                        "sample_location": "",
                        "type_of_primary_container": "Other",
                        "type_of_sample": "Other",
                        "collection_event_name": "",
                        "parent_sample_id": "",
                    } for s in range(num_samples[p])],
                } for p in range(num_participants)],
            },
        ).insert()

    # 3 trials
    # 15 participants
    # 40 samples
    # 3 files
    with cidc_api.app_context():
        insert_trial("1", 6, [0] * 6)
        insert_trial("2", 4, [5, 6, 7, 8])
        insert_trial("3", 5, [3, 2, 1, 1, 7])
        for i in range(3):
            DownloadableFiles(
                trial_id="1",
                upload_type="wes",
                object_url=str(i),
                facet_group="/wes/r2_L.fastq.gz",
                uploaded_timestamp=datetime.now(),
                file_size_bytes=2,
            ).insert()

    client = cidc_api.test_client()

    res = client.get("/info/data_overview")
    assert res.status_code == 200
    assert res.json == {
        "num_assays": len(prism.SUPPORTED_ASSAYS),
        "num_trials": 3,
        "num_participants": 15,
        "num_samples": 40,
        "num_files": 3,
        "num_bytes": 6,
    }
def make_file(trial_id, object_url, upload_type, facet_group) -> DownloadableFiles:
    return DownloadableFiles(
        trial_id=trial_id,
        upload_type=upload_type,
        object_url=f"{trial_id}/{object_url}",
        facet_group=facet_group,
        uploaded_timestamp=datetime.now(),
        file_size_bytes=int(51 * 1e6),  # 51MB
    )
Example 9
def setup_db_records(cidc_api):
    extra = {"_etag": ETAG}
    with cidc_api.app_context():
        Users(**users["json"], **extra).insert(compute_etag=False)
        TrialMetadata(**trial_metadata["json"], **extra).insert(compute_etag=False)
        DownloadableFiles(**downloadable_files["json"], **extra).insert(
            compute_etag=False
        )
        Permissions(**permissions["json"], **extra).insert(compute_etag=False)
        UploadJobs(**upload_jobs["json"], **extra).insert(compute_etag=False)
def test_get_related_files(cidc_api, clean_db, monkeypatch):
    """Check that the related_files endpoint calls `get_related_files`"""
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)

    client = cidc_api.test_client()

    # Add an additional file that is related to file 1
    object_url = "/foo/bar"
    with cidc_api.app_context():
        DownloadableFiles(
            trial_id=trial_id_1,
            upload_type="wes",
            object_url=object_url,
            facet_group="/wes/r2_L.fastq.gz",  # this is what makes this file "related"
            uploaded_timestamp=datetime.now(),
            file_size_bytes=0,
        ).insert()

    # Non-admins get 401s when requesting related files they don't have permission to view
    res = client.get(f"/downloadable_files/{file_id_1}/related_files")
    assert res.status_code == 401

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # Non-admins can get related files that they have permission to view
    res = client.get(f"/downloadable_files/{file_id_1}/related_files")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1  # file 1 has 1 related file
    assert res.json["_items"][0]["object_url"] == object_url

    # Admins can get related files without permissions
    make_admin(user_id, cidc_api)
    res = client.get(f"/downloadable_files/{file_id_2}/related_files")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 0  # file 2 has 0 related files
Example 11
def vis_preprocessing(event: dict, context: BackgroundContext):
    with sqlalchemy_session() as session:
        object_url = extract_pubsub_data(event)
        file_record: DownloadableFiles = DownloadableFiles.get_by_object_url(
            object_url, session=session)

        if not file_record:
            raise Exception(
                f"No downloadable file with object URL {object_url} found.")

        metadata_df = _get_metadata_df(file_record.trial_id)

        # Apply the transformations and get derivative data for visualization.
        for transform_name, transform in _get_transforms().items():
            vis_json = transform(file_record, metadata_df)
            if vis_json:
                # Add the vis config to the file_record
                setattr(file_record, transform_name, vis_json)

        # Save the derivative data additions to the database.
        session.commit()
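
From the loop above, each transform returned by _get_transforms() is a callable that takes the downloadable file record plus the trial metadata dataframe and returns a JSON-serializable vis config, or a falsy value when the file should be skipped. A minimal hypothetical transform in that shape (not one of the real transforms):

def example_transform(file_record, metadata_df):
    """Hypothetical transform: handle only CSV artifacts and emit a trivial config."""
    if not file_record.object_url.endswith(".csv"):
        return None
    return {"object_url": file_record.object_url, "num_metadata_rows": len(metadata_df)}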
Example 12
def _derive_files_from_upload(trial_id: str, upload_type: str, session):
    # Get trial metadata JSON for the associated trial
    trial_record: TrialMetadata = TrialMetadata.find_by_trial_id(
        trial_id, session=session)

    # Run the file derivation
    derivation_context = unprism.DeriveFilesContext(trial_record.metadata_json,
                                                    upload_type,
                                                    fetch_artifact)
    derivation_result = unprism.derive_files(derivation_context)

    # TODO: consider parallelizing this step if necessary
    for artifact in derivation_result.artifacts:
        # Save to GCS
        blob = upload_to_data_bucket(artifact.object_url, artifact.data)

        # Build basic facet group
        facet_group = f"{artifact.data_format}|{artifact.file_type}"

        # Save to database
        df_record = DownloadableFiles.create_from_blob(
            trial_record.trial_id,
            artifact.file_type,
            artifact.data_format,
            facet_group,
            blob,
            session=session,
            alert_artifact_upload=True,
        )
        df_record.additional_metadata = artifact.metadata
        # Assume that a derived file will be directly useful for data analysis
        df_record.analysis_friendly = True

    # Update the trial metadata blob (in case the file derivation modified it)
    trial_record.metadata_json = derivation_result.trial_metadata

    session.commit()
def test_list_trials(cidc_api, clean_db, monkeypatch):
    """Check that listing trials works as expected"""
    mock_gcloud_client(monkeypatch)
    user_id = setup_user(cidc_api, monkeypatch)
    trial_1, trial_2 = setup_trial_metadata(cidc_api, user_id)

    client = cidc_api.test_client()

    # A CIMAC user can list trials that they're allowed to see via
    # granular permissions
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1
    assert res.json["_items"][0]["id"] == trial_1
    assert "file_bundle" not in res.json["_items"][0]
    assert "num_participants" not in res.json["_items"][0]
    assert "num_samples" not in res.json["_items"][0]

    # A CIMAC user with a cross-trial permission can list all trials
    with cidc_api.app_context():
        Permissions(granted_by_user=user_id,
                    granted_to_user=user_id,
                    upload_type="ihc").insert()
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2

    # Allowed users can get all trials
    for role in trial_modifier_roles:
        make_role(user_id, role, cidc_api)

        res = client.get("/trial_metadata")
        assert res.status_code == 200
        assert len(res.json["_items"]) == 2
        assert res.json["_meta"]["total"] == 2
        assert set([t["id"]
                    for t in res.json["_items"]]) == set([trial_1, trial_2])
        assert not any("file_bundle" in t for t in res.json["_items"])

    # Passing the URL param include_file_bundles=true works on an
    # as-available basis - if trials have no files associated with them,
    # they won't have a file bundle in the response
    res = client.get("/trial_metadata?include_file_bundles=true")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2
    assert "file_bundle" not in res.json["_items"][0]

    # Add some files...
    with cidc_api.app_context():
        # for trial 1
        for id, (type, facet_group) in enumerate([
            ("cytof_10021_9204", "/cytof_10021_9204/spike_in.fcs"),
            ("cytof_10021_9204", "/cytof_10021_9204/source_.fcs"),
            (
                "cytof_10021_9204",
                "/cytof_analysis/combined_cell_counts_profiling.csv",
            ),
            ("wes", "/wes/r1_L.fastq.gz"),
        ]):
            DownloadableFiles(
                id=id,
                trial_id="test-trial-1",
                facet_group=facet_group,
                object_url=f"test-trial-1/{facet_group}",
                upload_type=type,
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            ).insert()
        # for trial 2
        for id_minus_4, (type, facet_group) in enumerate([
            ("participants info", "csv|participants info"),
            ("mif", "/mif/roi_/cell_seg_data.txt"),
        ]):
            DownloadableFiles(
                id=id_minus_4 + 4,
                trial_id="test-trial-2",
                facet_group=facet_group,
                object_url=f"test-trial-2/{facet_group}",
                upload_type=type,
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            ).insert()

    # Listing trials with populated file bundles (also, check that sorting and counting participants/samples works)
    res = client.get(
        "/trial_metadata?include_file_bundles=true&include_counts=true&sort_field=trial_id&sort_direction=asc"
    )
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2
    assert res.json["_items"][0]
    [trial_json_1, trial_json_2] = res.json["_items"]
    assert set(trial_json_1["file_bundle"]["CyTOF"]["source"]) == set([0, 1])
    assert trial_json_1["file_bundle"]["CyTOF"]["analysis"] == [2]
    assert trial_json_1["file_bundle"]["WES"]["source"] == [3]
    assert trial_json_1["num_samples"] == 1
    assert trial_json_1["num_participants"] == 1
    assert trial_json_2["file_bundle"]["Participants Info"]["clinical"] == [4]
    assert trial_json_2["file_bundle"]["mIF"]["analysis"] == [5]
    assert trial_json_2["num_samples"] == 0
    assert trial_json_2["num_participants"] == 0

    # Filtering by trial id seems to work when file bundles are included
    res = client.get(
        "/trial_metadata?include_file_bundles=true&trial_ids=test-trial-1")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1
    assert res.json["_items"][0]["trial_id"] == "test-trial-1"

    # Pagination seems to work when file bundles are included
    res = client.get("/trial_metadata?include_file_bundles=true&page_size=1")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1

    # Metadata blobs are pruned as expected
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    metadata_json = res.json["_items"][0]["metadata_json"]
    assert metadata_json.get("participants") is None
    assert metadata_json.get("assays") is None
    assert metadata_json.get("analysis") is None
    assert metadata_json.get("shipments") is None
def test_create_compressed_batch(cidc_api, clean_db, monkeypatch):
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)
    with cidc_api.app_context():
        url_1 = DownloadableFiles.find_by_id(file_id_1).object_url
        url_2 = DownloadableFiles.find_by_id(file_id_2).object_url

    client = cidc_api.test_client()

    url = "/downloadable_files/compressed_batch"

    # A JSON body containing a file ID list must be provided
    res = client.post(url)
    assert res.status_code == 422

    # User has no permissions, so no files should be found
    short_file_list = {"file_ids": [file_id_1, file_id_2]}
    res = client.post(url, json=short_file_list)
    assert res.status_code == 404

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # Mock GCS client
    blob = MagicMock()
    bucket = MagicMock()
    bucket.blob.return_value = blob
    monkeypatch.setattr(
        "cidc_api.resources.downloadable_files.gcloud_client._get_bucket",
        lambda _: bucket,
    )
    signed_url = "fake/signed/url"
    monkeypatch.setattr(
        "cidc_api.resources.downloadable_files.gcloud_client.get_signed_url",
        lambda *_: signed_url,
    )

    # User has one permission, so the endpoint should try to create
    # a compressed batch file with the single file the user has
    # access to in it.
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.json == signed_url
    print(bucket.get_blob.call_args_list)
    bucket.get_blob.assert_called_with(url_1)
    blob.upload_from_filename.assert_called_once()

    bucket.reset_mock()
    blob.reset_mock()

    make_admin(user_id, cidc_api)

    # Admin has access to both files, but together they are too large
    res = client.post(url, json=short_file_list)
    assert res.status_code == 400
    assert "batch too large" in res.json["_error"]["message"]
    bucket.get_blob.assert_not_called()
    blob.upload_from_filename.assert_not_called()

    # Decrease the size of one of the files and try again
    with cidc_api.app_context():
        df = DownloadableFiles.find_by_id(file_id_1)
        df.file_size_bytes = 1
        df.update()

    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.json == signed_url
    assert call(url_1) in bucket.get_blob.call_args_list
    assert call(url_2) in bucket.get_blob.call_args_list
    blob.upload_from_filename.assert_called_once()
def test_get_filelist(cidc_api, clean_db, monkeypatch):
    """Check that getting a filelist.tsv works as expected"""
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)

    client = cidc_api.test_client()

    url = "/downloadable_files/filelist"

    # A JSON body containing a file ID list must be provided
    res = client.post(url)
    assert res.status_code == 422

    # User has no permissions, so no files should be found
    short_file_list = {"file_ids": [file_id_1, file_id_2]}
    res = client.post(url, json=short_file_list)
    assert res.status_code == 404

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # User has one permission, so the filelist should contain a single file
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert "text/tsv" in res.headers["Content-Type"]
    assert "filename=filelist.tsv" in res.headers["Content-Disposition"]
    assert res.data.decode("utf-8") == (
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_1}/wes/.../reads_123.bam\t{trial_id_1}_wes_..._reads_123.bam\n"
    )

    # Admins don't need permissions to get files
    make_admin(user_id, cidc_api)
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.data.decode("utf-8") == (
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_1}/wes/.../reads_123.bam\t{trial_id_1}_wes_..._reads_123.bam\n"
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_2}/cytof/.../analysis.zip\t{trial_id_2}_cytof_..._analysis.zip\n"
    )

    # Clear inserted file records
    with cidc_api.app_context():
        clean_db.query(DownloadableFiles).delete()

    # Filelists don't get paginated
    ids = []
    with cidc_api.app_context():
        for id in range(1000):
            df = DownloadableFiles(
                trial_id=trial_id_1,
                object_url=str(id),
                upload_type="",
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            )
            df.insert()
            ids.append(df.id)

    res = client.post(url, json={"file_ids": ids})
    assert res.status_code == 200
    # newly inserted files + EOF newline
    assert len(res.data.decode("utf-8").split("\n")) == len(ids) + 1
Example 16
def test_create_downloadable_file_from_blob(clean_db, monkeypatch):
    """Try to create a downloadable file from a GCS blob"""
    fake_blob = MagicMock()
    fake_blob.name = "name"
    fake_blob.md5_hash = "12345"
    fake_blob.crc32c = "54321"
    fake_blob.size = 5
    fake_blob.time_created = datetime.now()

    clean_db.add(
        TrialMetadata(
            trial_id="id",
            metadata_json={
                "protocol_identifier": "id",
                "allowed_collection_event_names": [],
                "allowed_cohort_names": [],
                "participants": [],
            },
        )
    )
    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Mock artifact upload publishing
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    # Check that the file was created
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.object_url == fake_blob.name
    assert df_lookup.data_format == "Shipping Manifest"
    assert df_lookup.file_size_bytes == fake_blob.size
    assert df_lookup.md5_hash == fake_blob.md5_hash
    assert df_lookup.crc32c_hash == fake_blob.crc32c

    # uploading second time to check non duplicating entries
    fake_blob.size = 6
    fake_blob.md5_hash = "6"
    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Check that the file was created
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.file_size_bytes == 6
    assert df_lookup.md5_hash == "6"

    # Check that no artifact upload event was published
    publisher.assert_not_called()

    # Check that artifact upload publishes
    DownloadableFiles.create_from_blob(
        "id",
        "pbmc",
        "Shipping Manifest",
        "pbmc/shipping",
        fake_blob,
        alert_artifact_upload=True,
    )
    publisher.assert_called_once_with(fake_blob.name)
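
The second half of this test depends on create_from_blob updating the existing record for the same object URL instead of inserting a duplicate. A minimal sketch of that upsert idea, keyed on object_url and written as a generic helper; this is an assumption about the behavior under test, not the real implementation:

def upsert_by_object_url(session, model, attrs):
    """Hypothetical upsert: update the row matching attrs["object_url"], else insert a new one."""
    existing = session.query(model).filter_by(object_url=attrs["object_url"]).first()
    if existing is not None:
        for key, value in attrs.items():
            setattr(existing, key, value)
        return existing
    record = model(**attrs)
    session.add(record)
    return record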
Example 17
def test_trial_metadata_get_summaries(clean_db, monkeypatch):
    """Check that trial data summaries are computed as expected"""
    # Add some trials
    records = [{"fake": "record"}]
    cytof_record_with_output = [{"output_files": {"foo": "bar"}}]
    tm1 = {
        **METADATA,
        # deliberately override METADATA['protocol_identifier']
        "protocol_identifier": "tm1",
        "participants": [{"samples": [1, 2]}, {"samples": [3]}],
        "expected_assays": ["ihc", "olink"],
        "assays": {
            "wes": [
                {"records": records * 6},
                {"records": records * 5},
            ],  # 6 + 5 = 11 records: 7 for wes + 4 for wes_tumor_only
            "rna": [{"records": records * 2}],
            "mif": [
                {"records": records * 3},
                {"records": records},
                {"records": records},
            ],
            "elisa": [{"assay_xlsx": {"number_of_samples": 7}}],
            "nanostring": [
                {"runs": [{"samples": records * 2}]},
                {"runs": [{"samples": records * 1}]},
            ],
            "hande": [{"records": records * 5}],
        },
        "analysis": {
            "wes_analysis": {
                "pair_runs": [
                    # 7 here for wes_assay: t0/1/2, n0/1/2/3
                    {
                        "tumor": {"cimac_id": "t0"},
                        "normal": {"cimac_id": "n0"},
                    },  # no analysis data
                    {
                        "tumor": {"cimac_id": "t1"},
                        "normal": {"cimac_id": "n1"},
                        "report": {"report": "foo"},
                    },
                    {
                        "tumor": {"cimac_id": "t1"},
                        "normal": {"cimac_id": "n2"},
                        "report": {"report": "foo"},
                    },
                    {
                        "tumor": {"cimac_id": "t2"},
                        "normal": {"cimac_id": "n3"},
                        "report": {"report": "foo"},
                    },
                ],
                # these are excluded, so not adding fake assay data
                "excluded_samples": records * 2,
            },
            "wes_tumor_only_analysis": {
                "runs": records * 4,  # need 4
                # these are excluded, so not adding fake assay data
                "excluded_samples": records * 3,
            },
        },
        "clinical_data": {
            "records": [
                {"clinical_file": {"participants": ["a", "b", "c"]}},
                {"clinical_file": {"participants": ["a", "b", "d"]}},
                {"clinical_file": {"participants": ["e", "f", "g"]}},
            ]
        },
    }
    tm2 = {
        **METADATA,
        # deliberately override METADATA['protocol_identifier']
        "protocol_identifier": "tm1",
        "participants": [{"samples": []}],
        "assays": {
            "cytof_10021_9204": [
                {
                    "records": cytof_record_with_output * 2,
                    "excluded_samples": records * 2,
                },
                {"records": records * 2},
                {"records": records},
            ],
            "cytof_e4412": [
                {
                    "participants": [
                        {"samples": records},
                        {"samples": cytof_record_with_output * 5},
                        {"samples": records * 2},
                    ],
                    "excluded_samples": records,
                }
            ],
            "olink": {
                "batches": [
                    {
                        "records": [
                            {"files": {"assay_npx": {"number_of_samples": 2}}},
                            {"files": {"assay_npx": {"number_of_samples": 3}}},
                        ]
                    },
                    {"records": [{"files": {"assay_npx": {"number_of_samples": 3}}}]},
                ]
            },
        },
        "analysis": {
            "rna_analysis": {"level_1": records * 10, "excluded_samples": records * 2},
            "tcr_analysis": {
                "batches": [
                    {"records": records * 4, "excluded_samples": records * 3},
                    {"records": records * 2, "excluded_samples": records * 1},
                ]
            },
        },
    }
    TrialMetadata(trial_id="tm1", metadata_json=tm1).insert(validate_metadata=False)
    TrialMetadata(trial_id="tm2", metadata_json=tm2).insert(validate_metadata=False)

    # Add some files
    for i, (tid, fs) in enumerate([("tm1", 3), ("tm1", 2), ("tm2", 4), ("tm2", 6)]):
        DownloadableFiles(
            trial_id=tid,
            file_size_bytes=fs,
            object_url=str(i),
            facet_group="",
            uploaded_timestamp=datetime.now(),
            upload_type="",
        ).insert()

    sorter = lambda s: s["trial_id"]
    received = sorted(TrialMetadata.get_summaries(), key=sorter)
    expected = sorted(
        [
            {
                "expected_assays": [],
                "cytof": 13.0,
                "olink": 8.0,
                "trial_id": "tm2",
                "file_size_bytes": 10,
                "total_participants": 1,
                "total_samples": 0,
                "clinical_participants": 0.0,
                "rna": 0.0,
                "nanostring": 0.0,
                "elisa": 0.0,
                "h&e": 0.0,
                "mif": 0.0,
                "cytof_analysis": 7.0,
                "rna_level1_analysis": 10.0,
                "tcr_analysis": 6.0,
                "wes_analysis": 0.0,
                "wes_tumor_only_analysis": 0.0,
                "wes": 0.0,
                "wes_tumor_only": 0.0,
                "excluded_samples": {
                    "tcr_analysis": records * 4,
                    "rna_level1_analysis": records * 2,
                    "cytof_analysis": records * 3,
                },
            },
            {
                "expected_assays": ["ihc", "olink"],
                "elisa": 7.0,
                "cytof": 0.0,
                "olink": 0.0,
                "trial_id": "tm1",
                "file_size_bytes": 5,
                "total_participants": 2,
                "total_samples": 3,
                "clinical_participants": 7.0,
                "rna": 2.0,
                "nanostring": 3.0,
                "h&e": 5.0,
                "mif": 5.0,
                "cytof_analysis": 0.0,
                "rna_level1_analysis": 0.0,
                "tcr_analysis": 0.0,
                "wes_analysis": 5.0,
                "wes_tumor_only_analysis": 4.0,
                "wes": 7.0,
                "wes_tumor_only": 4.0,
                "excluded_samples": {
                    "wes_analysis": records * 2,
                    "wes_tumor_only_analysis": records * 3,
                },
            },
        ],
        key=sorter,
    )
    assert received == expected
    assert all("misc_data" not in entry for entry in received)
Example 18
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: %s"
            % (job.id, trial_id, metadata_patch)
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
        logger.info("Saving artifact records to the downloadable_files table.")
        for artifact_metadata, additional_metadata in downloadable_files:
            logger.debug(
                f"Saving metadata to downloadable_files table: {artifact_metadata}"
            )
            DownloadableFiles.create_from_metadata(
                trial_id,
                job.upload_type,
                artifact_metadata,
                additional_metadata=additional_metadata,
                session=session,
                commit=False,
            )

        # Additionally, make the metadata xlsx a downloadable file
        with saved_failure_status(job, session):
            _, xlsx_blob = _get_bucket_and_blob(
                storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri
            )
            full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}"
            data_format = "Assay Metadata"
            facet_group = f"{job.upload_type}|{data_format}"
            logger.info(f"Saving {full_uri} as a downloadable_file.")
            DownloadableFiles.create_from_blob(
                trial_id,
                job.upload_type,
                data_format,
                facet_group,
                xlsx_blob,
                session=session,
            )

        # Update the job metadata to include artifacts
        job.metadata_patch = metadata_patch

        # Making files downloadable by a specified biofx analysis team group
        assay_prefix = job.upload_type.split("_")[0]  # 'wes_bam' -> 'wes'
        if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT:
            analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix]
            _gcs_add_prefix_reader_permission(
                storage_client,
                analysis_group_email,  # to whom give access to
                f"{trial_id}/{assay_prefix}",  # to what sub-folder
            )

        # Save the upload success and trigger email alert if transaction succeeds
        job.ingestion_success(trial, session=session, send_email=True, commit=True)

        # Trigger post-processing on uploaded data files
        logger.info(f"Publishing object URLs to 'artifact_upload' topic")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor:
            executor.map(
                lambda url_bundle: publish_artifact_upload(url_bundle.target_url),
                url_bundles,
            )

        # Trigger post-processing on entire upload
        report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC)
        if report:
            report.result()

    # Google won't actually do anything with this response; it's
    # provided for testing purposes only.
    return jsonify(
        dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles)
    )