def populate_dataservice(data):
    """Load every entity from *data* into the dataservice.

    :param data: mapping of endpoint name -> {kf_id -> entity body}; each
        body is stamped with its kf_id and visible=True before posting
    :raises Exception: if the dataservice responds with anything other
        than 200 or 201
    """
    for endpoint, entities in data.items():
        for kf_id, body in entities.items():
            # The key doubles as the entity's kf_id; force visibility on.
            body["kf_id"] = kf_id
            body["visible"] = True
            response = Session().post(f"{host}/{endpoint}/", json=body)
            if response.status_code not in {200, 201}:
                raise Exception(response.json())
def do_get(url):
    """GET *url* and return its "results" payload with "_links" merged in.

    Returns None without making a request when the module-level ``quit``
    flag is set (NOTE(review): presumably an abort flag defined elsewhere
    in this module; it shadows the ``quit`` builtin — confirm).

    :raises Exception: if the response status is not 200
    """
    if quit:
        return
    resp = Session().get(url)
    if resp.status_code != 200:
        raise Exception(resp.text)
    payload = resp.json()
    results = payload["results"]
    # Carry the pagination links along with the results.
    results["_links"] = payload["_links"]
    return results
def yield_entities_from_filter(host, endpoint, filters, show_progress=False):
    """
    Scrape the dataservice for paginated entities matching the filter params.

    Note: It's almost always going to be safer to use this than requests.get
    with search parameters, because you never know when you'll get back more
    than one page of results for a query.

    :param host: dataservice base url string (e.g. "http://localhost:5000")
    :param endpoint: dataservice endpoint string (e.g. "genomic-files")
    :param filters: dict of filters to winnow results from the dataservice
        (e.g. {"study_id": "SD_DYPMEHHF", "external_id": "foo"})
    :param show_progress: if True, display a tqdm progress bar while paging
    :raises Exception: if the dataservice doesn't return status 200, or if
        the number of unique entities found differs from the reported total
    :yields: entities matching the filters
    """
    host = host.strip("/")
    endpoint = endpoint.strip("/")
    url = f"{host}/{endpoint}"
    found_kfids = set()
    which = {"limit": 100}
    expected = 0
    with tqdm(total=1, disable=not show_progress, leave=False) as pbar:
        while True:
            resp = Session().get(url, params={**which, **filters})
            if resp.status_code != 200:
                raise Exception(resp.text)
            j = resp.json()
            # The reported total can change while we page (entities added or
            # removed server-side), so resize the bar whenever it moves.
            if j["total"] != expected:
                n = pbar.n
                pbar.reset(j["total"])
                pbar.update(n)
                expected = j["total"]
            res = j["results"]
            if not res:
                pbar.close()
                # FIX: stop on an empty page instead of falling through to
                # the pagination links; an empty page that still carried a
                # "next" link would otherwise loop forever.
                break
            for entity in res:
                kfid = entity["kf_id"]
                if kfid not in found_kfids:
                    found_kfids.add(kfid)
                    pbar.update()
                    yield entity
            try:
                # Follow the "next" link by copying its after/after_uuid
                # cursor values (query params 1 and 2 in the link).
                for (key, i) in [("after", 1), ("after_uuid", 2)]:
                    which[key] = j["_links"]["next"].split("=")[i].split(
                        "&")[0]
            except KeyError:
                # No "next" link: we've consumed the final page.
                break
    num = len(found_kfids)
    # FIX: was a bare assert, which is stripped under `python -O`; raise an
    # explicit Exception (same message) so the sanity check always runs.
    if expected != num:
        raise Exception(f"FOUND {num} ENTITIES BUT EXPECTED {expected}")
def get_accession(self, study_id):
    """Look up a study's dbGaP accession info in the dataservice.

    :param study_id: Kids First study ID (e.g. "SD_DYPMEHHF")
    :raises Exception: if the study isn't found, or if its
        data_access_authority is not dbGaP
    :return: tuple of (external_id, version) for the study
    """
    resp = Session().get(f"{self.api_url}/studies/{study_id}")
    if resp.status_code != 200:
        raise Exception(f"Study {study_id} not found in dataservice")
    study = resp.json()["results"]
    # FIX: the field may be present but null, in which case .get() with a
    # default still returns None and .lower() raises AttributeError;
    # coalesce None to "" before lowercasing.
    authority = study.get("data_access_authority") or ""
    if authority.lower() == "dbgap":
        return study["external_id"], study["version"]
    else:
        raise Exception(
            f"data_access_authority for study {study_id} is not 'dbGaP'"
        )
def get_latest_sample_status(phs_id, required_status="released"):
    """Get the most recently released sample status for a study on dbGaP

    Starts from the unversioned accession and, whenever the returned
    registration_status doesn't match ``required_status``, retries with the
    previous version number until one matches or a request fails.

    :param phs_id: First part of study accession identifier, e.g. "phs001138"
    :type phs_id: string
    :param required_status: registration_status to accept (case-insensitive);
        pass None to accept whatever status the first response reports
    :raises Exception: if no released sample data can be found
    :return: full released accession id (e.g. phs001138.v3.p2), sample data
    :rtype: tuple
    """
    # Accessions already tried, mapped to why they were rejected (for the
    # error message if everything fails).
    tried = {}
    version = None
    while True:
        # No version on the first pass: dbGaP resolves the bare phs id to
        # its latest version.
        phs_string = f"{phs_id}.v{version}" if version is not None else phs_id
        url = ("https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/"
               f"GetSampleStatus.cgi?study_id={phs_string}&rettype=xml")
        print(f"Querying dbGaP for study {phs_string}")
        print(f"Manifest URL -> {url}")
        # Retry on gateway errors; dbGaP is flaky under load.
        data = Session(status_forcelist=(502, 503, 504)).get(url)
        if data.status_code != 200:
            # A failed request (e.g. walked back past v1) ends the search.
            tried[phs_string] = f"status {data.status_code}"
            raise Exception(f"Request for study {phs_id} failed."
                            f" - Tried: {tried}")
        # NOTE(review): parsing remote XML with the stdlib ElementTree; the
        # commented-out defusedxml guard below was apparently removed.
        # Consider restoring it (see https://github.com/tiran/defusedxml).
        # try:
        #     xml = DefusedET.tostring(DefusedET.fromstring(data.content))
        # except DefusedXmlException as e:
        #     raise Exception(
        #         f"DETECTED UNSAFE XML -- {repr(e)} -- FROM {url}\n"
        #         "SEE: https://github.com/tiran/defusedxml"
        #     ).with_traceback(e.__traceback__)
        xml = ElementTree.tostring(ElementTree.fromstring(data.content))
        data = xmltodict.parse(xml)
        study = data["DbGap"]["Study"]
        accession = study["@accession"]
        status = study["@registration_status"]
        if (required_status is None) or (
                status.lower() == required_status.lower()):
            break
        else:
            # try previous version: pull N out of "phsXXXXXX.vN.pM" and
            # decrement it for the next query.
            version = int(accession.split(".")[1][1:]) - 1
            print(f"Study {accession} is not '{required_status}'. "
                  f"registration_status: {status}")
            tried[accession] = status
    return accession, study
def _get_response(self, content, params=None, **kwargs):
    """API request implementation

    :param content: What kind of content we're requesting to get or set
    :param params: additional parameters to send
    :raises REDCapError: REDCap returned an error status
    :return: REDCap server requests.Response object
    """
    # Base payload required by every REDCap API call, overlaid with any
    # caller-supplied parameters.
    payload = dict(
        token=self.api_token,
        content=content,
        format="json",
        returnFormat="json",
        **(params or {}),
    )
    # REDCap wants the "data" field serialized as a JSON string.
    data_field = payload.get("data")
    if data_field is not None and not isinstance(data_field, str):
        payload["data"] = json.dumps(data_field)
    # Drop unset parameters entirely rather than sending nulls.
    payload = {key: val for key, val in payload.items() if val is not None}
    resp = Session(status_forcelist=(502, 503, 504)).post(
        self.api, data=payload, **kwargs
    )
    if resp.status_code != 200:
        raise REDCapError(f"HTTP {resp.status_code} - {resp.text}")
    return resp
def delete(u):
    """Issue an HTTP DELETE to *u* and return the raw response object."""
    response = Session().delete(u)
    return response
def test_sample_status(requests_mock):
    """End-to-end test of ConsentProcessor.get_patches_for_study against a
    local dataservice, with dbGaP responses mocked.

    Walks through: missing study, fully-populated study, null
    controlled_access on a visible/hidden GF, a deleted biospecimen, and an
    extra biospecimen not present in dbGaP.
    """
    # Let non-mocked requests (the local dataservice) through to the network.
    requests_mock._real_http = True
    mock_dbgap(requests_mock)
    study_id, data, expected_patches = load_data()
    # No study should raise a study not found exception
    clear_study(study_id)
    with pytest.raises(Exception) as e:
        ConsentProcessor(host).get_patches_for_study(study_id)
    assert f"{study_id} not found" in str(e.value)
    populate_dataservice(data)
    # Everything exists: patches should come back as expected
    patches, alerts = ConsentProcessor(host).get_patches_for_study(study_id)
    assert not alerts
    compare(patches, expected_patches)
    # A visible GF has controlled_access set to null
    Session().patch(
        f"{host}/genomic-files/GF_22222222", json={"controlled_access": None}
    )
    patches, alerts = ConsentProcessor(host).get_patches_for_study(study_id)
    assert alerts == [
        "ALERT: GF GF_22222222 is visible but has controlled_access set to null"
        " instead of True/False."
    ]
    # A hidden GF with controlled_access set to null gets empty acl
    # (harder to test because local dataservice doesn't store acl)
    Session().patch(
        f"{host}/genomic-files/GF_22222222", json={"visible": False}
    )
    patches, alerts = ConsentProcessor(host).get_patches_for_study(study_id)
    assert "GF_22222222" not in patches["genomic-files"]
    # A biospecimen is missing: patches should be absent relevant parts + alert
    Session().delete(f"{host}/biospecimens/BS_22222222")
    patches, alerts = ConsentProcessor(host).get_patches_for_study(study_id)
    # Expected patches minus everything tied to the deleted *_22222222 ids.
    new_expected_patches = {
        endpoint: {
            k: v for k, v in entities.items() if not k.endswith("22222222")
        }
        for endpoint, entities in expected_patches.items()
    }
    compare(patches, new_expected_patches)
    assert (
        data["biospecimens"]["BS_22222222"]["external_sample_id"]
        in alerts[0]
    )
    # An extra biospecimen: it should be hidden
    Session().post(
        f"{host}/biospecimens",
        json={
            "participant_id": "PT_11111111",
            "external_sample_id": "test_sample_4",
            "sequencing_center_id": "SC_11111111",
            "analyte_type": "DNA",
            "kf_id": "BS_44444444",
            "consent_type": "LOL",
            "dbgap_consent_code": "phs999999.c1",
        },
    )
    patches, alerts = ConsentProcessor(host).get_patches_for_study(study_id)
    # The unrecognized specimen should be patched hidden with consent cleared.
    new_expected_patches["biospecimens"]["BS_44444444"] = {
        "visible": False,
        "consent_type": None,
        "dbgap_consent_code": None,
    }
    compare(patches, new_expected_patches)
def clear_study(study_id):
    """Delete the given study from the dataservice (best effort; the
    response is not checked)."""
    target = f"{host}/studies/{study_id}"
    Session().delete(target)
def do_patch(url, patch):
    """PATCH *patch* (as JSON) to *url* and return a human-readable summary.

    :raises Exception: if the response status is not a 2xx success
    """
    msg = f"Patched {url} with {patch}"
    response = Session().patch(url, json=patch)
    if response.ok:
        return msg
    raise Exception(f"{response.status_code} -- {msg} -- {response.json()}")