def test_downloadable_files_data_category_prefix():
    """Check that data_category_prefixes are derived as expected"""
    file_w_category = DownloadableFiles(facet_group="/wes/r1_L.fastq.gz")
    assert file_w_category.data_category_prefix == "WES"

    file_no_category = DownloadableFiles()
    assert file_no_category.data_category_prefix is None
def test_upload_manifest_twice(cidc_api, clean_db, monkeypatch):
    """Ensure that uploading the same manifest twice doesn't produce any DownloadableFiles records"""
    user_id = setup_trial_and_user(cidc_api, monkeypatch)
    mocks = UploadMocks(monkeypatch)

    client = cidc_api.test_client()

    grant_upload_permission(user_id, "pbmc", cidc_api)
    make_nci_biobank_user(user_id, cidc_api)

    res = client.post(MANIFEST_UPLOAD, data=form_data("pbmc.xlsx", io.BytesIO(b"a"), "pbmc"))
    assert res.status_code == 200

    # Check that we tried to publish a patient/sample update
    mocks.publish_patient_sample_update.assert_called_once()

    with cidc_api.app_context():
        assert not DownloadableFiles.list()  # manifest is not stored

    # Upload a second time
    res = client.post(MANIFEST_UPLOAD, data=form_data("pbmc.xlsx", io.BytesIO(b"b"), "pbmc"))
    assert res.status_code == 200

    assert mocks.upload_xlsx.call_count == 0  # manifest is not stored

    with cidc_api.app_context():
        assert not DownloadableFiles.list()  # manifest is not stored
def test_downloadable_files_additional_metadata_default(clean_db):
    TrialMetadata.create(TRIAL_ID, METADATA)
    df = DownloadableFiles(
        trial_id=TRIAL_ID,
        upload_type="wes_bam",
        object_url="10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        file_size_bytes=1,
        md5_hash="hash1234",
        uploaded_timestamp=datetime.now(),
    )

    # Check no value passed
    df.insert()
    assert df.additional_metadata == {}

    for nullish_value in [None, "null", {}]:
        df.additional_metadata = nullish_value
        df.update()
        assert df.additional_metadata == {}

    # Non-nullish value doesn't get overridden
    non_nullish_value = {"foo": "bar"}
    df.additional_metadata = non_nullish_value
    df.update()
    assert df.additional_metadata == non_nullish_value
def test_create_downloadable_file_from_metadata(db, monkeypatch):
    """Try to create a downloadable file from artifact_core metadata"""
    # Fake file metadata
    file_metadata = {
        "artifact_category": "Assay Artifact from CIMAC",
        "object_url": "10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        "file_name": "wes_forward.fastq",
        "file_size_bytes": 1,
        "md5_hash": "hash1234",
        "uploaded_timestamp": datetime.now(),
        "foo": "bar",  # unsupported column - should be filtered
    }

    # Create the trial (to avoid violating the foreign-key constraint)
    TrialMetadata.patch_trial_metadata(TRIAL_ID, METADATA)

    # Create the file
    DownloadableFiles.create_from_metadata(TRIAL_ID, "wes", file_metadata)

    # Check that we created the file
    new_file = (
        db.query(DownloadableFiles)
        .filter_by(file_name=file_metadata["file_name"])
        .first()
    )
    assert new_file

    del file_metadata["foo"]
    for k in file_metadata.keys():
        assert getattr(new_file, k) == file_metadata[k]
def test_create_downloadable_file_from_metadata(clean_db, monkeypatch):
    """Try to create a downloadable file from artifact_core metadata"""
    # Fake file metadata
    file_metadata = {
        "object_url": "10021/Patient 1/sample 1/aliquot 1/wes_forward.fastq",
        "file_size_bytes": 1,
        "md5_hash": "hash1234",
        "uploaded_timestamp": datetime.now(),
        "foo": "bar",  # unsupported column - should be filtered
    }
    additional_metadata = {"more": "info"}

    # Mock artifact upload publishing
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    # Create the trial (to avoid violating the foreign-key constraint)
    TrialMetadata.create(TRIAL_ID, METADATA)

    # Create files with empty or "null" additional metadata
    for nullish_value in ["null", None, {}]:
        df = DownloadableFiles.create_from_metadata(
            TRIAL_ID, "wes_bam", file_metadata, additional_metadata=nullish_value
        )
        clean_db.refresh(df)
        assert df.additional_metadata == {}

    # Create the file
    DownloadableFiles.create_from_metadata(
        TRIAL_ID, "wes_bam", file_metadata, additional_metadata=additional_metadata
    )

    # Check that we created the file
    new_file = (
        clean_db.query(DownloadableFiles)
        .filter_by(object_url=file_metadata["object_url"])
        .first()
    )
    assert new_file

    del file_metadata["foo"]
    for k in file_metadata.keys():
        assert getattr(new_file, k) == file_metadata[k]
    assert new_file.additional_metadata == additional_metadata

    # Check that no artifact upload event was published
    publisher.assert_not_called()

    # Check that artifact upload publishes
    DownloadableFiles.create_from_metadata(
        TRIAL_ID,
        "wes_bam",
        file_metadata,
        additional_metadata=additional_metadata,
        alert_artifact_upload=True,
    )
    publisher.assert_called_once_with(file_metadata["object_url"])
def create_df(facet_group, additional_metadata={}) -> DownloadableFiles:
    df = DownloadableFiles(
        facet_group=facet_group,
        additional_metadata=additional_metadata,
        trial_id=TRIAL_ID,
        uploaded_timestamp=datetime.now(),
        file_size_bytes=0,
        object_url=facet_group,  # just filler, not relevant to the test
        upload_type="",
    )
    df.insert()
    clean_db.refresh(df)
    return df
def test_info_data_overview(cidc_api, clean_db):
    """Check that the data overview has the expected structure and values"""

    def insert_trial(trial_id, num_participants, num_samples):
        TrialMetadata(
            trial_id=trial_id,
            metadata_json={
                prism.PROTOCOL_ID_FIELD_NAME: trial_id,
                "allowed_cohort_names": [""],
                "allowed_collection_event_names": [""],
                "participants": [
                    {
                        "cimac_participant_id": f"CTTTPP{p}",
                        "participant_id": "x",
                        "samples": [
                            {
                                "cimac_id": f"CTTTPP1SS.0{s}",
                                "sample_location": "",
                                "type_of_primary_container": "Other",
                                "type_of_sample": "Other",
                                "collection_event_name": "",
                                "parent_sample_id": "",
                            }
                            for s in range(num_samples[p])
                        ],
                    }
                    for p in range(num_participants)
                ],
            },
        ).insert()

    # 3 trials
    # 15 participants
    # 40 samples
    # 3 files
    with cidc_api.app_context():
        insert_trial("1", 6, [0] * 6)
        insert_trial("2", 4, [5, 6, 7, 8])
        insert_trial("3", 5, [3, 2, 1, 1, 7])
        for i in range(3):
            DownloadableFiles(
                trial_id="1",
                upload_type="wes",
                object_url=str(i),
                facet_group="/wes/r2_L.fastq.gz",
                uploaded_timestamp=datetime.now(),
                file_size_bytes=2,
            ).insert()

    client = cidc_api.test_client()
    res = client.get("/info/data_overview")
    assert res.status_code == 200
    assert res.json == {
        "num_assays": len(prism.SUPPORTED_ASSAYS),
        "num_trials": 3,
        "num_participants": 15,
        "num_samples": 40,
        "num_files": 3,
        "num_bytes": 6,
    }
def make_file(trial_id, object_url, upload_type, facet_group) -> DownloadableFiles:
    return DownloadableFiles(
        trial_id=trial_id,
        upload_type=upload_type,
        object_url=f"{trial_id}/{object_url}",
        facet_group=facet_group,
        uploaded_timestamp=datetime.now(),
        file_size_bytes=int(51 * 1e6),  # 51MB
    )
def setup_db_records(cidc_api):
    extra = {"_etag": ETAG}
    with cidc_api.app_context():
        Users(**users["json"], **extra).insert(compute_etag=False)
        TrialMetadata(**trial_metadata["json"], **extra).insert(compute_etag=False)
        DownloadableFiles(**downloadable_files["json"], **extra).insert(
            compute_etag=False
        )
        Permissions(**permissions["json"], **extra).insert(compute_etag=False)
        UploadJobs(**upload_jobs["json"], **extra).insert(compute_etag=False)
def test_get_related_files(cidc_api, clean_db, monkeypatch):
    """Check that the related_files endpoint calls `get_related_files`"""
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)
    client = cidc_api.test_client()

    # Add an additional file that is related to file 1
    object_url = "/foo/bar"
    with cidc_api.app_context():
        DownloadableFiles(
            trial_id=trial_id_1,
            upload_type="wes",
            object_url=object_url,
            facet_group="/wes/r2_L.fastq.gz",  # this is what makes this file "related"
            uploaded_timestamp=datetime.now(),
            file_size_bytes=0,
        ).insert()

    # Non-admins get 401s when requesting related files they don't have permission to view
    res = client.get(f"/downloadable_files/{file_id_1}/related_files")
    assert res.status_code == 401

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # Non-admins can get related files that they have permission to view
    res = client.get(f"/downloadable_files/{file_id_1}/related_files")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1  # file 1 has 1 related file
    assert res.json["_items"][0]["object_url"] == object_url

    # Admins can get related files without permissions
    make_admin(user_id, cidc_api)
    res = client.get(f"/downloadable_files/{file_id_2}/related_files")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 0  # file 2 has 0 related files
def vis_preprocessing(event: dict, context: BackgroundContext):
    with sqlalchemy_session() as session:
        object_url = extract_pubsub_data(event)
        file_record: DownloadableFiles = DownloadableFiles.get_by_object_url(
            object_url, session=session
        )

        if not file_record:
            raise Exception(f"No downloadable file with object URL {object_url} found.")

        metadata_df = _get_metadata_df(file_record.trial_id)

        # Apply the transformations and get derivative data for visualization.
        for transform_name, transform in _get_transforms().items():
            vis_json = transform(file_record, metadata_df)
            if vis_json:
                # Add the vis config to the file_record
                setattr(file_record, transform_name, vis_json)

        # Save the derivative data additions to the database.
        session.commit()
def _derive_files_from_upload(trial_id: str, upload_type: str, session):
    # Get the trial metadata JSON for the associated trial
    trial_record: TrialMetadata = TrialMetadata.find_by_trial_id(
        trial_id, session=session
    )

    # Run the file derivation
    derivation_context = unprism.DeriveFilesContext(
        trial_record.metadata_json, upload_type, fetch_artifact
    )
    derivation_result = unprism.derive_files(derivation_context)

    # TODO: consider parallelizing this step if necessary
    for artifact in derivation_result.artifacts:
        # Save to GCS
        blob = upload_to_data_bucket(artifact.object_url, artifact.data)

        # Build the basic facet group
        facet_group = f"{artifact.data_format}|{artifact.file_type}"

        # Save to the database
        df_record = DownloadableFiles.create_from_blob(
            trial_record.trial_id,
            artifact.file_type,
            artifact.data_format,
            facet_group,
            blob,
            session=session,
            alert_artifact_upload=True,
        )
        df_record.additional_metadata = artifact.metadata
        # Assume that a derived file will be directly useful for data analysis
        df_record.analysis_friendly = True

    # Update the trial metadata blob (in case the file derivation modified it)
    trial_record.metadata_json = derivation_result.trial_metadata

    session.commit()
def test_list_trials(cidc_api, clean_db, monkeypatch):
    """Check that listing trials works as expected"""
    mock_gcloud_client(monkeypatch)
    user_id = setup_user(cidc_api, monkeypatch)
    trial_1, trial_2 = setup_trial_metadata(cidc_api, user_id)
    client = cidc_api.test_client()

    # A CIMAC user can list trials that they're allowed to see via
    # granular permissions
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1
    assert res.json["_items"][0]["id"] == trial_1
    assert "file_bundle" not in res.json["_items"][0]
    assert "num_participants" not in res.json["_items"][0]
    assert "num_samples" not in res.json["_items"][0]

    # A CIMAC user with a cross-trial permission can list all trials
    with cidc_api.app_context():
        Permissions(
            granted_by_user=user_id, granted_to_user=user_id, upload_type="ihc"
        ).insert()
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2

    # Allowed users can get all trials
    for role in trial_modifier_roles:
        make_role(user_id, role, cidc_api)
        res = client.get("/trial_metadata")
        assert res.status_code == 200
        assert len(res.json["_items"]) == 2
        assert res.json["_meta"]["total"] == 2
        assert set([t["id"] for t in res.json["_items"]]) == set([trial_1, trial_2])
        assert not any("file_bundle" in t for t in res.json["_items"])

    # Passing the URL param include_file_bundles=true works on an
    # as-available basis - if trials have no files associated with them,
    # they won't have a file bundle in the response
    res = client.get("/trial_metadata?include_file_bundles=true")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2
    assert "file_bundle" not in res.json["_items"][0]

    # Add some files...
    with cidc_api.app_context():
        # for trial 1
        for id, (type, facet_group) in enumerate(
            [
                ("cytof_10021_9204", "/cytof_10021_9204/spike_in.fcs"),
                ("cytof_10021_9204", "/cytof_10021_9204/source_.fcs"),
                (
                    "cytof_10021_9204",
                    "/cytof_analysis/combined_cell_counts_profiling.csv",
                ),
                ("wes", "/wes/r1_L.fastq.gz"),
            ]
        ):
            DownloadableFiles(
                id=id,
                trial_id="test-trial-1",
                facet_group=facet_group,
                object_url=f"test-trial-1/{facet_group}",
                upload_type=type,
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            ).insert()

        # for trial 2
        for id_minus_4, (type, facet_group) in enumerate(
            [
                ("participants info", "csv|participants info"),
                ("mif", "/mif/roi_/cell_seg_data.txt"),
            ]
        ):
            DownloadableFiles(
                id=id_minus_4 + 4,
                trial_id="test-trial-2",
                facet_group=facet_group,
                object_url=f"test-trial-2/{facet_group}",
                upload_type=type,
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            ).insert()

    # Listing trials with populated file bundles
    # (also, check that sorting and counting participants/samples works)
    res = client.get(
        "/trial_metadata?include_file_bundles=true&include_counts=true&sort_field=trial_id&sort_direction=asc"
    )
    assert res.status_code == 200
    assert len(res.json["_items"]) == 2
    assert res.json["_items"][0]
    [trial_json_1, trial_json_2] = res.json["_items"]
    assert set(trial_json_1["file_bundle"]["CyTOF"]["source"]) == set([0, 1])
    assert trial_json_1["file_bundle"]["CyTOF"]["analysis"] == [2]
    assert trial_json_1["file_bundle"]["WES"]["source"] == [3]
    assert trial_json_1["num_samples"] == 1
    assert trial_json_1["num_participants"] == 1
    assert trial_json_2["file_bundle"]["Participants Info"]["clinical"] == [4]
    assert trial_json_2["file_bundle"]["mIF"]["analysis"] == [5]
    assert trial_json_2["num_samples"] == 0
    assert trial_json_2["num_participants"] == 0

    # Filtering by trial id seems to work when file bundles are included
    res = client.get("/trial_metadata?include_file_bundles=true&trial_ids=test-trial-1")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1
    assert res.json["_items"][0]["trial_id"] == "test-trial-1"

    # Pagination seems to work when file bundles are included
    res = client.get("/trial_metadata?include_file_bundles=true&page_size=1")
    assert res.status_code == 200
    assert len(res.json["_items"]) == 1

    # Metadata blobs are pruned as expected
    res = client.get("/trial_metadata")
    assert res.status_code == 200
    metadata_json = res.json["_items"][0]["metadata_json"]
    assert metadata_json.get("participants") is None
    assert metadata_json.get("assays") is None
    assert metadata_json.get("analysis") is None
    assert metadata_json.get("shipments") is None
def test_create_compressed_batch(cidc_api, clean_db, monkeypatch):
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)
    with cidc_api.app_context():
        url_1 = DownloadableFiles.find_by_id(file_id_1).object_url
        url_2 = DownloadableFiles.find_by_id(file_id_2).object_url

    client = cidc_api.test_client()
    url = "/downloadable_files/compressed_batch"

    # A JSON body containing a file ID list must be provided
    res = client.post(url)
    assert res.status_code == 422

    # User has no permissions, so no files should be found
    short_file_list = {"file_ids": [file_id_1, file_id_2]}
    res = client.post(url, json=short_file_list)
    assert res.status_code == 404

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # Mock the GCS client
    blob = MagicMock()
    bucket = MagicMock()
    bucket.blob.return_value = blob
    monkeypatch.setattr(
        "cidc_api.resources.downloadable_files.gcloud_client._get_bucket",
        lambda _: bucket,
    )
    signed_url = "fake/signed/url"
    monkeypatch.setattr(
        "cidc_api.resources.downloadable_files.gcloud_client.get_signed_url",
        lambda *_: signed_url,
    )

    # User has one permission, so the endpoint should try to create
    # a compressed batch file containing the single file the user has
    # access to.
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.json == signed_url
    bucket.get_blob.assert_called_with(url_1)
    blob.upload_from_filename.assert_called_once()

    bucket.reset_mock()
    blob.reset_mock()

    make_admin(user_id, cidc_api)

    # Admin has access to both files, but together they are too large
    res = client.post(url, json=short_file_list)
    assert res.status_code == 400
    assert "batch too large" in res.json["_error"]["message"]
    bucket.get_blob.assert_not_called()
    blob.upload_from_filename.assert_not_called()

    # Decrease the size of one of the files and try again
    with cidc_api.app_context():
        df = DownloadableFiles.find_by_id(file_id_1)
        df.file_size_bytes = 1
        df.update()

    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.json == signed_url
    assert call(url_1) in bucket.get_blob.call_args_list
    assert call(url_2) in bucket.get_blob.call_args_list
    blob.upload_from_filename.assert_called_once()
def test_get_filelist(cidc_api, clean_db, monkeypatch):
    """Check that getting a filelist.tsv works as expected"""
    user_id = setup_user(cidc_api, monkeypatch)
    file_id_1, file_id_2 = setup_downloadable_files(cidc_api)
    client = cidc_api.test_client()

    url = "/downloadable_files/filelist"

    # A JSON body containing a file ID list must be provided
    res = client.post(url)
    assert res.status_code == 422

    # User has no permissions, so no files should be found
    short_file_list = {"file_ids": [file_id_1, file_id_2]}
    res = client.post(url, json=short_file_list)
    assert res.status_code == 404

    # Give the user one permission
    with cidc_api.app_context():
        perm = Permissions(
            granted_to_user=user_id,
            trial_id=trial_id_1,
            upload_type=upload_types[0],
            granted_by_user=user_id,
        )
        perm.insert()

    # User has one permission, so the filelist should contain a single file
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert "text/tsv" in res.headers["Content-Type"]
    assert "filename=filelist.tsv" in res.headers["Content-Disposition"]
    assert res.data.decode("utf-8") == (
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_1}/wes/.../reads_123.bam\t{trial_id_1}_wes_..._reads_123.bam\n"
    )

    # Admins don't need permissions to get files
    make_admin(user_id, cidc_api)
    res = client.post(url, json=short_file_list)
    assert res.status_code == 200
    assert res.data.decode("utf-8") == (
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_1}/wes/.../reads_123.bam\t{trial_id_1}_wes_..._reads_123.bam\n"
        f"gs://{GOOGLE_DATA_BUCKET}/{trial_id_2}/cytof/.../analysis.zip\t{trial_id_2}_cytof_..._analysis.zip\n"
    )

    # Clear inserted file records
    with cidc_api.app_context():
        clean_db.query(DownloadableFiles).delete()

    # Filelists don't get paginated
    ids = []
    with cidc_api.app_context():
        for id in range(1000):
            df = DownloadableFiles(
                trial_id=trial_id_1,
                object_url=str(id),
                upload_type="",
                file_size_bytes=0,
                uploaded_timestamp=datetime.now(),
            )
            df.insert()
            ids.append(df.id)
    res = client.post(url, json={"file_ids": ids})
    assert res.status_code == 200
    # newly inserted files + EOF newline
    assert len(res.data.decode("utf-8").split("\n")) == len(ids) + 1
def test_create_downloadable_file_from_blob(clean_db, monkeypatch):
    """Try to create a downloadable file from a GCS blob"""
    fake_blob = MagicMock()
    fake_blob.name = "name"
    fake_blob.md5_hash = "12345"
    fake_blob.crc32c = "54321"
    fake_blob.size = 5
    fake_blob.time_created = datetime.now()

    clean_db.add(
        TrialMetadata(
            trial_id="id",
            metadata_json={
                "protocol_identifier": "id",
                "allowed_collection_event_names": [],
                "allowed_cohort_names": [],
                "participants": [],
            },
        )
    )

    # Mock artifact upload publishing
    publisher = MagicMock()
    monkeypatch.setattr("cidc_api.models.models.publish_artifact_upload", publisher)

    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Check that the file was created
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.object_url == fake_blob.name
    assert df_lookup.data_format == "Shipping Manifest"
    assert df_lookup.file_size_bytes == fake_blob.size
    assert df_lookup.md5_hash == fake_blob.md5_hash
    assert df_lookup.crc32c_hash == fake_blob.crc32c

    # Upload a second time to check that the entry doesn't get duplicated
    fake_blob.size = 6
    fake_blob.md5_hash = "6"
    df = DownloadableFiles.create_from_blob(
        "id", "pbmc", "Shipping Manifest", "pbmc/shipping", fake_blob
    )

    # Check that the file was updated in place, not duplicated
    assert 1 == clean_db.query(DownloadableFiles).count()
    df_lookup = DownloadableFiles.find_by_id(df.id)
    assert df_lookup.file_size_bytes == 6
    assert df_lookup.md5_hash == "6"

    # Check that no artifact upload event was published
    publisher.assert_not_called()

    # Check that artifact upload publishes
    DownloadableFiles.create_from_blob(
        "id",
        "pbmc",
        "Shipping Manifest",
        "pbmc/shipping",
        fake_blob,
        alert_artifact_upload=True,
    )
    publisher.assert_called_once_with(fake_blob.name)
def test_trial_metadata_get_summaries(clean_db, monkeypatch):
    """Check that trial data summaries are computed as expected"""
    # Add some trials
    records = [{"fake": "record"}]
    cytof_record_with_output = [{"output_files": {"foo": "bar"}}]
    tm1 = {
        **METADATA,
        # deliberately override METADATA['protocol_identifier']
        "protocol_identifier": "tm1",
        "participants": [{"samples": [1, 2]}, {"samples": [3]}],
        "expected_assays": ["ihc", "olink"],
        "assays": {
            "wes": [
                {"records": records * 6},
                {"records": records * 5},
            ],  # 6 + 5 = 11 = 7 for wes + 4 for wes_tumor_only
            "rna": [{"records": records * 2}],
            "mif": [
                {"records": records * 3},
                {"records": records},
                {"records": records},
            ],
            "elisa": [{"assay_xlsx": {"number_of_samples": 7}}],
            "nanostring": [
                {"runs": [{"samples": records * 2}]},
                {"runs": [{"samples": records * 1}]},
            ],
            "hande": [{"records": records * 5}],
        },
        "analysis": {
            "wes_analysis": {
                "pair_runs": [
                    # 7 here for wes_assay: t0/1/2, n0/1/2/3
                    {
                        "tumor": {"cimac_id": "t0"},
                        "normal": {"cimac_id": "n0"},
                    },  # no analysis data
                    {
                        "tumor": {"cimac_id": "t1"},
                        "normal": {"cimac_id": "n1"},
                        "report": {"report": "foo"},
                    },
                    {
                        "tumor": {"cimac_id": "t1"},
                        "normal": {"cimac_id": "n2"},
                        "report": {"report": "foo"},
                    },
                    {
                        "tumor": {"cimac_id": "t2"},
                        "normal": {"cimac_id": "n3"},
                        "report": {"report": "foo"},
                    },
                ],
                # these are excluded, so not adding fake assay data
                "excluded_samples": records * 2,
            },
            "wes_tumor_only_analysis": {
                "runs": records * 4,  # need 4
                # these are excluded, so not adding fake assay data
                "excluded_samples": records * 3,
            },
        },
        "clinical_data": {
            "records": [
                {"clinical_file": {"participants": ["a", "b", "c"]}},
                {"clinical_file": {"participants": ["a", "b", "d"]}},
                {"clinical_file": {"participants": ["e", "f", "g"]}},
            ]
        },
    }
    tm2 = {
        **METADATA,
        # deliberately override METADATA['protocol_identifier']
        "protocol_identifier": "tm2",
        "participants": [{"samples": []}],
        "assays": {
            "cytof_10021_9204": [
                {
                    "records": cytof_record_with_output * 2,
                    "excluded_samples": records * 2,
                },
                {"records": records * 2},
                {"records": records},
            ],
            "cytof_e4412": [
                {
                    "participants": [
                        {"samples": records},
                        {"samples": cytof_record_with_output * 5},
                        {"samples": records * 2},
                    ],
                    "excluded_samples": records,
                }
            ],
            "olink": {
                "batches": [
                    {
                        "records": [
                            {"files": {"assay_npx": {"number_of_samples": 2}}},
                            {"files": {"assay_npx": {"number_of_samples": 3}}},
                        ]
                    },
                    {"records": [{"files": {"assay_npx": {"number_of_samples": 3}}}]},
                ]
            },
        },
        "analysis": {
            "rna_analysis": {"level_1": records * 10, "excluded_samples": records * 2},
            "tcr_analysis": {
                "batches": [
                    {"records": records * 4, "excluded_samples": records * 3},
                    {"records": records * 2, "excluded_samples": records * 1},
                ]
            },
        },
    }
    TrialMetadata(trial_id="tm1", metadata_json=tm1).insert(validate_metadata=False)
    TrialMetadata(trial_id="tm2", metadata_json=tm2).insert(validate_metadata=False)

    # Add some files
    for i, (tid, fs) in enumerate([("tm1", 3), ("tm1", 2), ("tm2", 4), ("tm2", 6)]):
        DownloadableFiles(
            trial_id=tid,
            file_size_bytes=fs,
            object_url=str(i),
            facet_group="",
            uploaded_timestamp=datetime.now(),
            upload_type="",
        ).insert()

    sorter = lambda s: s["trial_id"]
    received = sorted(TrialMetadata.get_summaries(), key=sorter)
    expected = sorted(
        [
            {
                "expected_assays": [],
                "cytof": 13.0,
                "olink": 8.0,
                "trial_id": "tm2",
                "file_size_bytes": 10,
                "total_participants": 1,
                "total_samples": 0,
                "clinical_participants": 0.0,
                "rna": 0.0,
                "nanostring": 0.0,
                "elisa": 0.0,
                "h&e": 0.0,
                "mif": 0.0,
                "cytof_analysis": 7.0,
                "rna_level1_analysis": 10.0,
                "tcr_analysis": 6.0,
                "wes_analysis": 0.0,
                "wes_tumor_only_analysis": 0.0,
                "wes": 0.0,
                "wes_tumor_only": 0.0,
                "excluded_samples": {
                    "tcr_analysis": records * 4,
                    "rna_level1_analysis": records * 2,
                    "cytof_analysis": records * 3,
                },
            },
            {
                "expected_assays": ["ihc", "olink"],
                "elisa": 7.0,
                "cytof": 0.0,
                "olink": 0.0,
                "trial_id": "tm1",
                "file_size_bytes": 5,
                "total_participants": 2,
                "total_samples": 3,
                "clinical_participants": 7.0,
                "rna": 2.0,
                "nanostring": 3.0,
                "h&e": 5.0,
                "mif": 5.0,
                "cytof_analysis": 0.0,
                "rna_level1_analysis": 0.0,
                "tcr_analysis": 0.0,
                "wes_analysis": 5.0,
                "wes_tumor_only_analysis": 4.0,
                "wes": 7.0,
                "wes_tumor_only": 4.0,
                "excluded_samples": {
                    "wes_analysis": records * 2,
                    "wes_tumor_only_analysis": records * 3,
                },
            },
        ],
        key=sorter,
    )
    assert received == expected
    assert all("misc_data" not in entry for entry in received)
def ingest_upload(event: dict, context: BackgroundContext):
    """
    When a successful upload event is published, move the data associated
    with the upload job into the download bucket and merge the upload metadata
    into the appropriate clinical trial JSON.
    """
    storage_client = storage.Client()

    job_id = int(extract_pubsub_data(event))

    logger.info(f"ingest_upload execution started on upload job id {job_id}")

    with sqlalchemy_session() as session:
        job: UploadJobs = UploadJobs.find_by_id(job_id, session=session)

        # Check ingestion pre-conditions
        if not job:
            raise Exception(f"No assay upload job with id {job_id} found.")
        if UploadJobStatus(job.status) != UploadJobStatus.UPLOAD_COMPLETED:
            raise Exception(
                f"Received ID for job with status {job.status}. Aborting ingestion."
            )
        trial_id = job.metadata_patch.get(prism.PROTOCOL_ID_FIELD_NAME)
        if not trial_id:
            # We should never hit this, since metadata should be pre-validated.
            with saved_failure_status(job, session):
                raise Exception(
                    f"Invalid assay metadata: missing protocol identifier ({prism.PROTOCOL_ID_FIELD_NAME})."
                )

        logger.info(
            f"Found completed upload job (job_id={job_id}) with uploader {job.uploader_email}"
        )

        url_bundles = [
            URLBundle(*bundle) for bundle in job.upload_uris_with_data_uris_with_uuids()
        ]

        # Copy GCS blobs in parallel
        logger.info("Copying artifacts from upload bucket to data bucket.")
        with ThreadPoolExecutor(THREADPOOL_THREADS) as executor, saved_failure_status(
            job, session
        ):
            destination_objects = executor.map(
                lambda url_bundle: _gcs_copy(
                    storage_client,
                    GOOGLE_UPLOAD_BUCKET,
                    url_bundle.upload_url,
                    GOOGLE_DATA_BUCKET,
                    url_bundle.target_url,
                ),
                url_bundles,
            )

        metadata_patch = job.metadata_patch
        logger.info("Adding artifact metadata to metadata patch.")
        metadata_patch, downloadable_files = TrialMetadata.merge_gcs_artifacts(
            metadata_patch,
            job.upload_type,
            zip([ub.artifact_uuid for ub in url_bundles], destination_objects),
        )

        # Add metadata for this upload to the database
        logger.info(
            "Merging metadata from upload %d into trial %s: %s"
            % (job.id, trial_id, metadata_patch)
        )
        with saved_failure_status(job, session):
            trial = TrialMetadata.patch_assays(
                trial_id, metadata_patch, session=session
            )

        # Save downloadable files to the database
        # NOTE: this needs to happen after TrialMetadata.patch_assays
        # in order to avoid violating a foreign-key constraint on the trial_id
        # in the event that this is the first upload for a trial.
logger.info("Saving artifact records to the downloadable_files table.") for artifact_metadata, additional_metadata in downloadable_files: logger.debug( f"Saving metadata to downloadable_files table: {artifact_metadata}" ) DownloadableFiles.create_from_metadata( trial_id, job.upload_type, artifact_metadata, additional_metadata=additional_metadata, session=session, commit=False, ) # Additionally, make the metadata xlsx a downloadable file with saved_failure_status(job, session): _, xlsx_blob = _get_bucket_and_blob( storage_client, GOOGLE_DATA_BUCKET, job.gcs_xlsx_uri ) full_uri = f"gs://{GOOGLE_DATA_BUCKET}/{xlsx_blob.name}" data_format = "Assay Metadata" facet_group = f"{job.upload_type}|{data_format}" logger.info(f"Saving {full_uri} as a downloadable_file.") DownloadableFiles.create_from_blob( trial_id, job.upload_type, data_format, facet_group, xlsx_blob, session=session, ) # Update the job metadata to include artifacts job.metadata_patch = metadata_patch # Making files downloadable by a specified biofx analysis team group assay_prefix = job.upload_type.split("_")[0] # 'wes_bam' -> 'wes' if assay_prefix in GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT: analysis_group_email = GOOGLE_ANALYSIS_PERMISSIONS_GROUPS_DICT[assay_prefix] _gcs_add_prefix_reader_permission( storage_client, analysis_group_email, # to whom give access to f"{trial_id}/{assay_prefix}", # to what sub-folder ) # Save the upload success and trigger email alert if transaction succeeds job.ingestion_success(trial, session=session, send_email=True, commit=True) # Trigger post-processing on uploaded data files logger.info(f"Publishing object URLs to 'artifact_upload' topic") with ThreadPoolExecutor(THREADPOOL_THREADS) as executor: executor.map( lambda url_bundle: publish_artifact_upload(url_bundle.target_url), url_bundles, ) # Trigger post-processing on entire upload report = _encode_and_publish(str(job.id), GOOGLE_ASSAY_OR_ANALYSIS_UPLOAD_TOPIC) if report: report.result() # Google won't actually do anything with this response; it's # provided for testing purposes only. return jsonify( dict((bundle.upload_url, bundle.target_url) for bundle in url_bundles) )