コード例 #1
0
def populate_dataservice(data):
    """Create every entity in ``data`` on the dataservice.

    Each entity dict is stamped in place with its ``kf_id`` (taken from its
    key) and ``visible=True`` and then POSTed to ``{host}/{endpoint}/``.

    :param data: dict mapping endpoint name -> {kf_id: entity dict}
    :raises Exception: if any POST returns a status other than 200 or 201;
        the exception carries the decoded JSON body, or the raw response
        text when the body is not valid JSON
    """
    for endpoint, entities in data.items():
        for kf_id, entity in entities.items():
            # NOTE: the caller's dicts are modified in place.
            entity["kf_id"] = kf_id
            entity["visible"] = True
            resp = Session().post(f"{host}/{endpoint}/", json=entity)
            if resp.status_code not in {200, 201}:
                try:
                    detail = resp.json()
                except ValueError:
                    # A non-JSON error body (e.g. an HTML gateway page) must
                    # not hide the real failure behind a decode error.
                    detail = resp.text
                raise Exception(detail)
コード例 #2
0
ファイル: scrape.py プロジェクト: kids-first/kf-utils-python
 def do_get(url):
     """Fetch ``url`` and return its "results" with "_links" attached.

     Returns None without making a request when ``quit`` (a name defined
     outside this function) is truthy.

     :param url: address to GET
     :raises Exception: with the response text if the status is not 200
     :return: the body's "results" object, with the body's "_links" stored
         under its own "_links" key
     """
     if quit:
         return None
     reply = Session().get(url)
     if reply.status_code != 200:
         raise Exception(reply.text)
     payload = reply.json()
     entity = payload["results"]
     # Keep the navigation links alongside the entity itself.
     entity["_links"] = payload["_links"]
     return entity
コード例 #3
0
ファイル: scrape.py プロジェクト: kids-first/kf-utils-python
def yield_entities_from_filter(host, endpoint, filters, show_progress=False):
    """
    Scrape the dataservice for paginated entities matching the filter params.

    Note: It's almost always going to be safer to use this than requests.get
    with search parameters, because you never know when you'll get back more
    than one page of results for a query.

    :param host: dataservice base url string (e.g. "http://localhost:5000")
    :param endpoint: dataservice endpoint string (e.g. "genomic-files")
    :param filters: dict of filters to winnow results from the dataservice
        (e.g. {"study_id": "SD_DYPMEHHF", "external_id": "foo"})
    :param show_progress: if True, display a tqdm progress bar while scraping
    :raises Exception: if the dataservice doesn't return status 200, or if a
        "next" link does not carry both pagination cursor parameters
    :raises AssertionError: if the number of distinct entities found differs
        from the total reported by the dataservice
    :yields: entities matching the filters
    """
    from urllib.parse import parse_qs, urlparse

    host = host.strip("/")
    endpoint = endpoint.strip("/")
    url = f"{host}/{endpoint}"

    found_kfids = set()
    which = {"limit": 100}
    expected = 0
    # One session for all pages instead of a new one per request
    # (presumably a requests-style session, so connections can be reused).
    session = Session()
    with tqdm(total=1, disable=not show_progress, leave=False) as pbar:
        while True:
            resp = session.get(url, params={**which, **filters})

            if resp.status_code != 200:
                raise Exception(resp.text)

            j = resp.json()
            if j["total"] != expected:
                # The reported total changed: resize the bar but keep the
                # progress made so far.
                n = pbar.n
                pbar.reset(j["total"])
                pbar.update(n)

            expected = j["total"]
            res = j["results"]

            if not res:
                pbar.close()
            for entity in res:
                kfid = entity["kf_id"]
                # Entities already seen on an earlier page are not re-yielded.
                if kfid not in found_kfids:
                    found_kfids.add(kfid)
                    pbar.update()
                    yield entity

            try:
                next_link = j["_links"]["next"]
            except KeyError:
                # No next page advertised: pagination is complete.
                break
            if not next_link:
                break

            # Read the cursor by parameter name rather than by position so
            # the order of query parameters in the link does not matter.
            cursor = parse_qs(
                urlparse(next_link).query, keep_blank_values=True
            )
            try:
                which["after"] = cursor["after"][0]
                which["after_uuid"] = cursor["after_uuid"][0]
            except KeyError as e:
                raise Exception(
                    f"Cannot paginate: unexpected next link {next_link!r}"
                ) from e

    num = len(found_kfids)
    # Raised explicitly (not via `assert`) so the check survives `python -O`.
    if expected != num:
        raise AssertionError(f"FOUND {num} ENTITIES BUT EXPECTED {expected}")
コード例 #4
0
    def get_accession(self, study_id):
        """Look up a study and return its external id and version.

        :param study_id: id of the study, appended to
            ``{self.api_url}/studies/``
        :raises Exception: if the request does not return status 200, or if
            the study's data_access_authority is not "dbgap"
            (case-insensitive), including when it is missing or null
        :return: tuple of (external_id, version) from the study record
        """
        resp = Session().get(f"{self.api_url}/studies/{study_id}")
        if resp.status_code != 200:
            raise Exception(f"Study {study_id} not found in dataservice")
        study = resp.json()["results"]

        # `or ""` also covers a key that is present with a null value, where
        # a `.get(key, "")` default would still hand back None.
        authority = study.get("data_access_authority") or ""
        if authority.lower() != "dbgap":
            raise Exception(
                f"data_access_authority for study {study_id} is not 'dbGaP'"
            )
        return study["external_id"], study["version"]
コード例 #5
0
def get_latest_sample_status(phs_id, required_status="released"):
    """Find the newest version of a dbGaP study with the required status.

    The study is first queried without a version; while the returned
    registration status does not match, the previous version number is
    queried instead.

    :param phs_id: First part of study accession identifier, e.g. "phs001138"
    :type phs_id: string
    :param required_status: registration status to look for, compared
        case-insensitively; None accepts the first result returned
    :raises Exception: if a query returns a status other than 200
    :return: full accession id (e.g. phs001138.v3.p2), sample data
    :rtype: tuple
    """
    attempts = {}
    version = None
    while True:
        if version is None:
            phs_string = phs_id
        else:
            phs_string = f"{phs_id}.v{version}"
        url = (
            "https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/"
            f"GetSampleStatus.cgi?study_id={phs_string}&rettype=xml"
        )

        print(f"Querying dbGaP for study {phs_string}")
        print(f"Manifest URL -> {url}")

        response = Session(status_forcelist=(502, 503, 504)).get(url)
        if response.status_code != 200:
            attempts[phs_string] = f"status {response.status_code}"
            raise Exception(
                f"Request for study {phs_id} failed."
                f" - Tried: {attempts}"
            )

        # NOTE(review): an earlier defusedxml-based parse that rejected
        # unsafe XML was disabled here; the plain ElementTree parser is used.
        xml = ElementTree.tostring(ElementTree.fromstring(response.content))

        study = xmltodict.parse(xml)["DbGap"]["Study"]
        accession = study["@accession"]
        status = study["@registration_status"]

        if required_status is None:
            return accession, study
        if status.lower() == required_status.lower():
            return accession, study

        # Not the status we want: step back to the previous version number,
        # taken from the "v<N>" part of the accession.
        version = int(accession.split(".")[1][1:]) - 1
        print(
            f"Study {accession} is not '{required_status}'. "
            f"registration_status: {status}"
        )
        attempts[accession] = status
コード例 #6
0
    def _get_response(self, content, params=None, **kwargs):
        """POST a request to the REDCap API and return the response.

        :param content: What kind of content we're requesting to get or set
        :param params: additional parameters to send; they override the
            defaults, a non-string "data" value is JSON-encoded, and entries
            left with a None value are dropped
        :param kwargs: passed through to the session's ``post`` call
        :raises REDCapError: REDCap returned a status other than 200
        :return: the response object returned by the POST
        """
        payload = {
            "token": self.api_token,
            "content": content,
            "format": "json",
            "returnFormat": "json",
        }
        if params:
            payload.update(params)

        # Structured "data" is sent as a JSON string.
        if "data" in payload:
            data_value = payload["data"]
            if not isinstance(data_value, str):
                payload["data"] = json.dumps(data_value)

        form = {}
        for key, value in payload.items():
            if value is not None:
                form[key] = value

        session = Session(status_forcelist=(502, 503, 504))
        resp = session.post(self.api, data=form, **kwargs)
        if resp.status_code == 200:
            return resp
        raise REDCapError(f"HTTP {resp.status_code} - {resp.text}")
コード例 #7
0
 def delete(u):
     """Send a DELETE for ``u`` through a new Session and return the result."""
     session = Session()
     outcome = session.delete(u)
     return outcome
コード例 #8
0
def test_sample_status(requests_mock):
    """Check ConsentProcessor.get_patches_for_study across study states.

    One study is walked through a sequence of states (absent, fully
    populated, a visible genomic file with null controlled_access, that file
    hidden, a deleted biospecimen, an extra biospecimen) and the returned
    patches and alerts are checked after each change.
    """
    # Presumably lets requests that are not mocked reach the real host.
    requests_mock._real_http = True
    mock_dbgap(requests_mock)

    study_id, data, expected_patches = load_data()
    gf_url = f"{host}/genomic-files/GF_22222222"

    def fetch():
        return ConsentProcessor(host).get_patches_for_study(study_id)

    # Study absent: a "not found" exception is expected.
    clear_study(study_id)
    with pytest.raises(Exception) as excinfo:
        fetch()
    assert f"{study_id} not found" in str(excinfo.value)

    populate_dataservice(data)

    # Fully populated: expected patches, no alerts.
    patches, alerts = fetch()
    assert not alerts
    compare(patches, expected_patches)

    # Visible GF whose controlled_access is null: one alert.
    Session().patch(gf_url, json={"controlled_access": None})
    patches, alerts = fetch()
    assert alerts == [
        "ALERT: GF GF_22222222 is visible but has controlled_access set to null"
        " instead of True/False."
    ]

    # Same GF, now hidden: it gets an empty acl
    # (harder to test because local dataservice doesn't store acl)
    Session().patch(gf_url, json={"visible": False})
    patches, alerts = fetch()
    assert "GF_22222222" not in patches["genomic-files"]

    # Biospecimen removed: its related patches disappear and an alert names it.
    Session().delete(f"{host}/biospecimens/BS_22222222")
    patches, alerts = fetch()
    remaining = {}
    for endpoint, entities in expected_patches.items():
        remaining[endpoint] = {
            kf_id: patch
            for kf_id, patch in entities.items()
            if not kf_id.endswith("22222222")
        }
    compare(patches, remaining)
    missing_sample = data["biospecimens"]["BS_22222222"]["external_sample_id"]
    assert missing_sample in alerts[0]

    # Extra biospecimen: it should be hidden.
    extra_biospecimen = {
        "participant_id": "PT_11111111",
        "external_sample_id": "test_sample_4",
        "sequencing_center_id": "SC_11111111",
        "analyte_type": "DNA",
        "kf_id": "BS_44444444",
        "consent_type": "LOL",
        "dbgap_consent_code": "phs999999.c1",
    }
    Session().post(f"{host}/biospecimens", json=extra_biospecimen)
    patches, alerts = fetch()
    remaining["biospecimens"]["BS_44444444"] = {
        "visible": False,
        "consent_type": None,
        "dbgap_consent_code": None,
    }
    compare(patches, remaining)
コード例 #9
0
def clear_study(study_id):
    """Send a DELETE for ``{host}/studies/{study_id}``.

    The response is not inspected and nothing is returned.
    """
    session = Session()
    session.delete(f"{host}/studies/{study_id}")
コード例 #10
0
 def do_patch(url, patch):
     """PATCH ``url`` with ``patch`` and return a summary message.

     :param url: address to send the PATCH to
     :param patch: JSON-serializable body of the PATCH
     :raises Exception: if the response is not ok; the message holds the
         status code, the summary and the response body (decoded JSON, or
         the raw text when the body is not valid JSON)
     :return: the string "Patched {url} with {patch}"
     """
     msg = f"Patched {url} with {patch}"
     resp = Session().patch(url, json=patch)
     if not resp.ok:
         try:
             detail = resp.json()
         except ValueError:
             # A non-JSON error body must not replace the real failure
             # with a decode error.
             detail = resp.text
         raise Exception(f"{resp.status_code} -- {msg} -- {detail}")
     return msg