def test_yield_entities_from_filter(dataservice_setup):
    """Filter-based fetching returns the same entities via both generators."""
    count = 2
    populate_data(count)
    si = 1
    filter = {"study_id": f"SD_{si}1111111"}

    def fetch_both_ways(endpoint):
        # Fetch once through the dedicated filter generator and once
        # through the generic entity generator; both should agree.
        return [
            list(yield_entities_from_filter(DATASERVICE_URL, endpoint, filter)),
            list(yield_entities(DATASERVICE_URL, endpoint, filter)),
        ]

    # Get all participants from one study
    for participants in fetch_both_ways("participants"):
        assert len(participants) == count
        for participant in participants:
            assert participant["kf_id"].startswith(f"PT_{si}")

    # Get all biospecimens from one study
    for specimens in fetch_both_ways("biospecimens"):
        assert len(specimens) == count * count
        for specimen in specimens:
            assert specimen["kf_id"].startswith(f"BS_{si}")
def entities_dict(endpoint, filt):
    # Index the fetched entities by their KF ID. `self` is a free
    # variable captured from the enclosing method's scope.
    fetched = yield_entities(self.api_url, endpoint, filt, True)
    return {entity["kf_id"]: entity for entity in fetched}
def find_descendants_by_filter(
    api_url,
    endpoint,
    filter,
    ignore_gfs_with_hidden_external_contribs,
    kfids_only=True,
    db_url=None,
):
    """
    Similar to find_descendants_by_kfids but starts with an API endpoint
    filter instead of a list of endpoint KFIDs.
    """
    # Resolve the filter to concrete entities first, then delegate the
    # descendant discovery to find_descendants_by_kfids.
    matched = list(yield_entities(api_url, endpoint, filter, show_progress=True))
    if kfids_only:
        matched = [entity["kf_id"] for entity in matched]
    # Prefer the direct DB connection when one was provided (much faster).
    return find_descendants_by_kfids(
        db_url or api_url,
        endpoint,
        matched,
        ignore_gfs_with_hidden_external_contribs,
        kfids_only=kfids_only,
    )
def test_yield_entities_from_kfids(dataservice_setup):
    """Entities can be fetched directly by KF ID via either generator."""
    populate_data(2)
    kfid_set = {"SD_11111111", "PT_11111111", "BS_11111111"}
    for entities in (
        list(yield_entities_from_kfids(DATASERVICE_URL, kfid_set)),
        list(yield_entities(DATASERVICE_URL, None, kfid_set)),
    ):
        # Exactly the requested KF IDs come back, no more and no fewer.
        assert len(entities) == len(kfid_set)
        assert {entity["kf_id"] for entity in entities} == kfid_set
def merge_s3_and_kf_gfs(ds_url, study_kfid, study_bucket, exclude_s3_keypaths=None):
    """Return file data from S3 and the Kids First dataservice merged
    together on external_id to see which S3 files have been loaded into the
    data service and which loaded files no longer exist.

    Note: You must be able to query both S3 and the dataservice
    (VPN + chopaws if running locally)

    :param study_kfid: Dataservice KFID of the study
    :type study_kfid: string
    :param study_bucket: Amazon S3 bucket containing study files
    :type study_bucket: string
    :param exclude_s3_keypaths: S3 paths starting with these strings will be
        excluded, optional, defaults to None
    :type exclude_s3_keypaths: string, iterable
    :return: list of dicts
    """
    # Files from the dataservice, keyed by external_id.
    # We use the API because direct DB queries won't give us the gen3 fields
    kf = {
        e["external_id"]: {
            f"kf_{k.lower()}": v
            for k, v in e.items()
            if k not in ["_links", "access_urls", "urls"]
        }
        for e in yield_entities(
            ds_url,
            "genomic-files",
            {"study_id": study_kfid},
            show_progress=True,
        )
    }

    # Files from S3, keyed by their full s3:// URL (presumably the value
    # the dataservice stores as external_id — the merge below relies on it)
    s3 = {
        "s3://" + o["Bucket"] + "/" + o["Key"]:
        {f"s3_{k.lower()}": v for k, v in o.items()}
        for o in fetch_bucket_obj_info(
            study_bucket,
            drop_folders=True,
        )
    }

    # Sadly it's muuuuch harder to exclude paths on the S3 request side because
    # the S3 API doesn't support it. So we're stuck for now waiting for
    # potentially thousands of pagination requests that we don't care about,
    # and then we remove them here.
    if exclude_s3_keypaths:
        # Normalize to a tuple so str.startswith can test every prefix in
        # one call. (The previous `elif exclude_s3_keypaths is not None`
        # branch was dead logic: this block only runs when the value is
        # truthy, so it can never be None here.)
        if isinstance(exclude_s3_keypaths, str):
            exclude_s3_keypaths = (exclude_s3_keypaths, )
        else:
            exclude_s3_keypaths = tuple(exclude_s3_keypaths)
        s3 = {
            k: v
            for k, v in s3.items()
            if not v["s3_key"].startswith(exclude_s3_keypaths)
        }

    # Merge them together: start from the S3 entries and overlay the
    # dataservice fields. The defaultdict keeps dataservice entries whose
    # files no longer exist in S3 in the output as well.
    s3kf = defaultdict(dict, s3)
    for k, v in kf.items():
        s3kf[k].update(v)

    return list(s3kf.values())
def find_descendants_by_kfids(
    api_or_db_url,
    parent_endpoint,
    parents,
    ignore_gfs_with_hidden_external_contribs,
    kfids_only=True,
):
    """
    Given a set of KFIDs from a specified endpoint, find the KFIDs of all
    descendant entities.

    Given a family kfid, the result will be all participants in that family,
    all of the participants' biospecimens/outcomes/phenotypes/etc, all of
    their biospecimens' resultant genomic files, and all of the genomic
    files' sequencing experiments and read groups.

    Given a set of genomic file kfids, the result will be just their
    sequencing experiments and read groups.

    If you plan to make the discovered descendants visible, you should set
    ignore_gfs_with_hidden_external_contribs=True so that you don't
    accidentally unhide a genomic file that has hidden contributing
    biospecimens. If you plan to make the discovered descendants hidden,
    you should set ignore_gfs_with_hidden_external_contribs=False so that
    everything linked to the hidden biospecimens also get hidden.

    Special performance note: a database connect url will run MUCH faster
    compared to a dataservice api host

    :param api_or_db_url: dataservice api host _or_ database connect url
        e.g. "https://kf-api-dataservice.kidsfirstdrc.org" or
        "postgres://<USERNAME>:<PASSWORD>@kf-dataservice-postgres-prd.kids-first.io:5432/kfpostgresprd"
    :param parent_endpoint: endpoint of the starting kfids being passed in
    :param parents: iterable of starting kfids or entities associated with
        the parent_endpoint
    :param ignore_gfs_with_hidden_external_contribs: whether to ignore
        genomic files (and their descendants) that contain information from
        hidden biospecimens unrelated to the given parents.
    :param kfids_only: only return KFIDs, not entire entities. When False,
        each value is instead a dict mapping kfid -> entity.
    :returns: dict mapping endpoints to their sets of discovered kfids
    """
    # Choose the API path or the direct-postgres path from the URL scheme.
    use_api = api_or_db_url.startswith(("http:", "https:"))
    if use_api:
        parent_type = parent_endpoint
    else:
        # The DB path works in table names rather than API endpoint names;
        # keep the reverse map so results can be converted back at the end.
        endpoint_to_table = {
            "studies": "study",
            "participants": "participant",
            "family-relationships": "family_relationship",
            "outcomes": "outcome",
            "phenotypes": "phenotype",
            "diagnoses": "diagnosis",
            "biospecimens": "biospecimen",
            "families": "family",
            "biospecimen-genomic-files": "biospecimen_genomic_file",
            "biospecimen-diagnoses": "biospecimen_diagnosis",
            "genomic-files": "genomic_file",
            "read-group-genomic-files": "read_group_genomic_file",
            "sequencing-experiment-genomic-files": "sequencing_experiment_genomic_file",
            "read-groups": "read_group",
            "sequencing-experiments": "sequencing_experiment",
        }
        table_to_endpoint = {v: k for k, v in endpoint_to_table.items()}
        parent_type = endpoint_to_table[parent_endpoint]

    # `descendancy` maps a parent type to its child link descriptors
    # (child_type, link_on_parent, link_on_child). The two module-level
    # variants presumably mirror each other for API vs DB naming — defined
    # elsewhere in this module.
    if use_api:
        descendancy = _api_descendancy
    else:
        descendancy = _db_descendancy
        db_conn = psycopg2.connect(api_or_db_url)
        db_cur = db_conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

    # Allow a single kfid string to be passed instead of an iterable.
    if isinstance(parents, str):
        parents = [parents]

    # `parents` may be kfid strings or whole entity dicts. If entities were
    # given, use them directly; otherwise fetch them by kfid.
    if isinstance(next(iter(parents), None), dict):
        parent_kfids = set(p["kf_id"] for p in parents)
        descendants = {parent_type: {p["kf_id"]: p for p in parents}}
    else:
        parent_kfids = set(parents)
        if use_api:
            descendants = {
                parent_type: {
                    e["kf_id"]: e
                    for e in yield_entities(api_or_db_url, None, parent_kfids)
                }
            }
        else:
            query = f"select distinct * from {parent_type} where kf_id in %s"
            # The `| {None}` padding keeps the tuple non-empty so the SQL
            # `in %s` clause stays valid even when there are no kfids.
            db_cur.execute(query, (tuple(parent_kfids | {None}), ))
            descendants = {
                parent_type: {p["kf_id"]: dict(p) for p in db_cur.fetchall()}
            }

    # Pre-mark every type that precedes the parent in the descendancy
    # ordering as "done" so the traversal below never walks upward past the
    # starting type. (Relies on descendancy's key order.)
    done = set()
    for t in descendancy.keys():
        if t != parent_type:
            done.add(t)
        else:
            break

    def _inner(parent_type, parent_kfids, descendants):
        # Depth-first walk of the descendancy graph, accumulating entities
        # into `descendants` keyed by type. `done` guards against visiting
        # a type twice.
        if parent_type in done:
            return
        done.add(parent_type)
        for (child_type, link_on_parent,
             link_on_child) in descendancy.get(parent_type, []):
            if use_api:
                # Fan out one API filter query per parent kfid in parallel.
                with ThreadPoolExecutor() as tpex:
                    futures = [
                        tpex.submit(
                            _accumulate,
                            yield_entities,
                            api_or_db_url,
                            child_type,
                            {link_on_child: k},
                            show_progress=True,
                        ) for k in parent_kfids
                    ]
                    children = {
                        e["kf_id"]: e
                        for f in as_completed(futures) for e in f.result()
                    }
            else:
                # special case for getting to families from studies
                # (family has no study_id column; go through participant)
                if parent_type == "study" and child_type == "family":
                    query = (
                        "select distinct family.* from family join participant"
                        " on participant.family_id = family.kf_id join study on"
                        " participant.study_id = study.kf_id where study.kf_id "
                        "in %s")
                else:
                    # Identifiers interpolated here come from the hardcoded
                    # descendancy/table maps, not user input; the kfid values
                    # themselves are passed as a bound parameter.
                    query = (
                        f"select distinct {child_type}.* from {child_type} join {parent_type}"
                        f" on {child_type}.{link_on_child} = {parent_type}.{link_on_parent}"
                        f" where {parent_type}.kf_id in %s")
                db_cur.execute(query, (tuple(parent_kfids | {None}), ))
                children = {c["kf_id"]: dict(c) for c in db_cur.fetchall()}
            if children:
                descendants[child_type] = descendants.get(child_type, dict())
                descendants[child_type].update(children)
            if (child_type == "genomic_file"
                ) and ignore_gfs_with_hidden_external_contribs:
                # Ignore multi-specimen genomic files that have hidden
                # contributing specimens which are not in the descendants
                extra_contrib_gfs = find_gfs_with_extra_contributors(
                    api_or_db_url,
                    descendants["biospecimen"],
                    descendants["genomic_file"],
                )
                to_remove = (extra_contrib_gfs["hidden"]
                             | extra_contrib_gfs["mixed_visibility"])
                descendants["genomic_file"] = {
                    k: v
                    for k, v in descendants["genomic_file"].items()
                    if k not in to_remove
                }
        # Recurse into each discovered child type after this level is fully
        # populated (so sibling links are resolved before descending).
        for (child_type, _, _) in descendancy.get(parent_type, []):
            if descendants.get(child_type):
                _inner(child_type, descendants[child_type].keys(), descendants)

    _inner(parent_type, parent_kfids, descendants)

    if not use_api:
        # Convert table-name keys back to API endpoint names.
        descendants = {table_to_endpoint[k]: v for k, v in descendants.items()}

    if kfids_only:
        # Collapse each kfid->entity dict to just the set of kfids.
        # (Rebinding values during items() iteration is safe; the key set
        # does not change.)
        for k, v in descendants.items():
            descendants[k] = set(descendants[k])

    return descendants