def recursive_execute_fhir_dsl(
    query: dict,
    scroll: bool = False,
    progress: Union[None, tqdm] = None,
    auth_args: Auth = Auth.shared(),
    callback: Union[Callable[[Any, bool], None], None] = None,
    max_pages: Union[int, None] = None,
    _current_page: int = 1,
    _scroll_id: str = "true",
    _prev_hits: List = [],
):
    will_scroll = query_allows_scrolling(query) and scroll

    response = execute_single_fhir_dsl(
        query,
        scroll_id=_scroll_id if will_scroll else "",
        retry_backoff=will_scroll,
        auth_args=auth_args,
    )

    is_first_iteration = _scroll_id == "true"
    current_results = response.data.get("hits").get("hits")
    _scroll_id = response.data.get("_scroll_id", "")
    actual_count = response.data["hits"]["total"]["value"]
    current_result_count = len(current_results)

    if is_first_iteration and progress:
        progress.reset(actual_count)

    if progress:
        progress.update(current_result_count)

    is_last_batch = (
        (current_result_count == 0)
        or (scroll is False)
        or ((max_pages is not None) and (_current_page >= max_pages))
    )

    results = [] if callback else [*_prev_hits, *current_results]

    if callback and not is_last_batch:
        callback(current_results, False)
    elif callback and is_last_batch:
        return callback(current_results, True)
    elif is_last_batch:
        suffix = "+" if actual_count == MAX_RESULT_SIZE else ""
        print(f"Retrieved {len(results)}/{actual_count}{suffix} results")
        return results

    return recursive_execute_fhir_dsl(
        query,
        scroll=True,
        progress=progress,
        auth_args=auth_args,
        callback=callback,
        max_pages=max_pages,
        _current_page=_current_page + 1,
        _scroll_id=_scroll_id,
        _prev_hits=results,
    )
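# A minimal usage sketch (not from the library's docs): stream pages of FHIR
# results through a callback instead of accumulating them in memory. The
# "patient" table name and page count are illustrative, and `handle_batch` is
# a hypothetical helper.
def handle_batch(batch, is_finished):
    print(f"Received {len(batch)} hits")

    if is_finished:
        return "done"


recursive_execute_fhir_dsl(
    {"type": "select", "columns": "*", "from": [{"table": "patient"}]},
    scroll=True,
    max_pages=2,
    callback=handle_batch,
)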
def get_data_frame(
    cls,
    name: Optional[str] = None,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    show_progress: bool = False,
):
    """Execute a request for projects

    ## Parameters

    Query: `phc.easy.projects.ProjectListOptions`

    Execution: `phc.easy.query.Query.execute_paging_api`
    """
    if page_size is None:
        # Projects do not have much data so use a higher page size
        page_size = 100

    get_data_frame = super().get_data_frame

    auth = Auth(auth_args)

    get_data_frame_args = without_keys(
        cls._get_current_args(inspect.currentframe(), locals()),
        ["auth_args", "account", "show_progress"],
    )

    def get_projects_for_account(account: dict):
        df = get_data_frame(
            ignore_cache=True,
            all_results=max_pages is None,
            auth_args=auth.customized({"account": account["id"]}),
            show_progress=show_progress,
            **get_data_frame_args,
        )
        df["account"] = account["id"]

        return df

    frame = pd.concat(list(pmap(get_projects_for_account, auth.accounts())))

    return frame.reset_index(drop=True)
def get_data_frame(auth_args: Auth = Auth.shared()):
    auth = Auth(auth_args)
    client = BaseClient(auth.session())

    response = client._api_call(
        "knowledge/gene-sets",
        http_verb="GET",
        params={"datasetId": auth.project_id},
    )

    frame = pd.DataFrame(response.data["items"])

    if "genes" in frame.columns:
        frame["genes"] = frame.genes.apply(
            lambda genes: ",".join([d["gene"] for d in genes])
        )

    # Drop the column (axis=1) rather than an index label
    frame = frame.drop(["datasetId"], axis=1, errors="ignore")

    return frame
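# A minimal usage sketch, assuming this staticmethod is exposed as
# `phc.easy.GeneSet.get_data_frame` (class name and export path inferred from
# context, not confirmed here) and that an account and project are selected.
import phc.easy as phc

phc.Auth.set({"account": "<your-account-name>"})
phc.Project.set_current("My Project Name")

gene_sets = phc.GeneSet.get_data_frame()
print(gene_sets.head())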
def get(
    self,
    record_id: str,
    auth_args: Auth = Auth.shared(),
    return_if_not_found=True,
):
    """Perform a GET on the DSTU3 resource"""
    auth = Auth(auth_args)
    client = BaseClient(auth.session())

    try:
        response = client._fhir_call(
            f"{self.entity}/{record_id}", http_verb="GET"
        ).data
    except ApiError as e:
        if return_if_not_found and e.response.data == "Not Found":
            return None

        raise e

    return json.loads(response)
def get(cls, id: str, auth_args: Auth = Auth.shared(), **kw_args):
    results = (
        super()
        .get_data_frame(
            id=id,
            term={"meta.tag.code.keyword": "PrecisionOCR Service"},
            auth_args=auth_args,
            **kw_args,
        )
        .to_dict("records")
    )

    return results[0] if len(results) else None
def get_count(cls, query_overrides: dict = {}, auth_args=Auth.shared()):
    """Get the count for a given FSS query"""
    return Query.find_count_of_dsl_query(
        {
            "type": "select",
            "columns": "*",
            "from": [{"table": cls.table_name()}],
            **query_overrides,
        },
        auth_args=auth_args,
    )
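# A minimal usage sketch, assuming `get_count` is inherited by the easy-module
# entity classes (e.g. `phc.Observation`); the class name here is illustrative.
import phc.easy as phc

observation_count = phc.Observation.get_count()
print(f"observations: {observation_count}")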
def update(
    self,
    record_id: str,
    update: Callable[[dict], dict],
    auth_args: Auth = Auth.shared(),
):
    """Perform an update on the DSTU3 resource through an update function"""
    data = self.get(record_id, auth_args=auth_args, return_if_not_found=False)

    return self.put(record_id, update(data), auth_args)
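# A minimal sketch of the update-through-function pattern: the callable
# receives the current resource as a dict and returns the full document to
# PUT back. `dstu3_resource` stands in for an instance of this DSTU3 helper
# class, and the "status" field is illustrative only.
def mark_as_amended(resource: dict) -> dict:
    return {**resource, "status": "amended"}


dstu3_resource.update("<record-id>", mark_as_amended)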
def get_data_frame(
    cls, all_results=False, auth_args: Auth = Auth.shared(), **kw_args
):
    return super().get_data_frame(
        term={"meta.tag.code.keyword": "PrecisionOCR Service"},
        all_results=all_results,
        auth_args=auth_args,
        **{"ignore_cache": True, **kw_args},
    )
def get_data_frame(
    document_id: str, raw: bool = False, auth_args: Auth = Auth.shared()
):
    auth = Auth(auth_args)
    document = Document.get(document_id, auth_args=auth_args)

    file_id = pipe(
        document.get("content", []),
        c.filter(
            lambda c: c.get("format", {}).get("code") == "ocr-text-file-id"
        ),
        c.first,
        c.get("attachment", default={}),
        c.get("url"),
        iffy(isa(str), lambda url: url.split("/")[-1]),
    )

    if file_id is None:
        raise ValueError(f"No block file found for document: '{document_id}'")

    files = Files(auth.session())
    filename = files.download(file_id, "/tmp/")

    frame = pd.read_json(filename, lines=True)
    os.remove(filename)

    if raw or len(frame) == 0:
        return frame

    return Block.sort(
        frame.drop(["Geometry"], axis=1)
        .join(pd.json_normalize(frame.Geometry))
        .pipe(
            partial(
                Frame.expand,
                custom_columns=[
                    Frame.codeable_like_column_expander("Polygon")
                ],
            )
        )
        .set_index("Id")
    )
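# A minimal usage sketch, assuming this function is exposed as
# `Block.get_data_frame` (inferred from the `Block.sort` call above): pull the
# OCR block-level output, including bounding polygons, for one document.
blocks = Block.get_data_frame("<document-id>")
print(blocks.head())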
def find(search: str, auth_args: Auth = Auth.shared()):
    """Search for a project using given criteria and return results as a data frame

    Attributes
    ----------
    search : str
        Part of a project's id, name, or description to search for

    auth_args : Any
        The authentication to use for the account and project (defaults to shared)
    """
    projects = Project.get_data_frame(auth_args=auth_args)
    text = projects[SEARCH_COLUMNS].agg(join_strings, axis=1)

    return projects[text.str.contains(search.lower())]
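# A usage sketch following the docstring above: search across project ids,
# names, and descriptions. The search term is illustrative.
import phc.easy as phc

phc.Auth.set({"account": "<your-account-name>"})

matches = phc.Project.find("oncology")
print(matches)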
def test_passing_options_through_to_paging_api(execute_paging_api):
    execute_paging_api.return_value = pd.DataFrame()
    auth = Auth()

    GenomicShortVariant.get_data_frame(
        [str(uuid4())], raw=True, log=True, auth_args=auth
    )

    kwargs = execute_paging_api.call_args[1]

    assert kwargs.get("auth_args") == auth
    assert kwargs.get("log") == True
    assert kwargs.get("raw") == True
def get_data_frame(search: str = "", auth_args: Auth = Auth.shared()):
    auth = Auth(auth_args)
    client = BaseClient(auth.session())

    response = client._api_call(
        "knowledge/genes",
        http_verb="GET",
        params={"datasetId": auth.project_id, "gene": search},
    )

    frame = pd.DataFrame(response.data["items"])

    if "alias" in frame.columns:
        frame["alias"] = frame.alias.apply(
            lambda aliases: ",".join(aliases)
            if isinstance(aliases, list)
            else None
        )

    # We choose not to expand topCancerDrivers and cancerDrivers since each
    # can easily hold 50 values. If those are needed, the user will have to
    # extract them manually.

    return frame
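# A minimal usage sketch, assuming this staticmethod is exposed as
# `phc.easy.Gene.get_data_frame` (class name inferred from context, not
# confirmed here): look up knowledge-base entries for genes matching a search
# string.
import phc.easy as phc

genes = phc.Gene.get_data_frame(search="BRAF")
print(genes.head())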
def test_skipping_genomic_tests_if_variant_set_ids(get_data_frame):
    variant_set_ids = [str(uuid4())]

    test_df = GenomicVariant._get_genomic_tests(
        variant_set_ids,
        max_pages=None,
        all_results=False,
        auth_args=Auth(),
        log=False,
    )

    assert_equals(get_data_frame.call_count, 0)
    assert_equals(len(test_df.columns), 1)
    assert_equals(list(test_df.id), variant_set_ids)
def set_current(search: str, auth: Auth = Auth.shared()):
    """Search for a project using given criteria, set it to the authentication
    object, and return the matching projects as a data frame

    Attributes
    ----------
    search : str
        Part of a project's id, name, or description to search for

    auth : Auth
        The authentication to update for the account and project (defaults to shared)
    """
    matches = Project.find(search, auth)

    if len(matches) > 1:
        print("Multiple projects found. Try a more specific search")
    elif len(matches) == 0:
        print(f'No matches found for search "{search}"')
    else:
        project = matches.iloc[0]
        # Uses private method since this is a special case
        auth.update({"account": project.account, "project_id": project.id})

    return matches
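# A usage sketch following the docstring examples elsewhere in this module:
# select the working project once, then run subsequent easy-module queries
# against it.
import phc.easy as phc

phc.Auth.set({"account": "<your-account-name>"})
phc.Project.set_current("My Project Name")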
def get(
    cls,
    id: str,
    auth_args: Auth = Auth.shared(),
    query_overrides={},
    **kw_args,
):
    query_overrides = pipe(
        query_overrides,
        term_adder({"meta.tag.code.keyword": "PrecisionOCR Service"}),
    )

    return (
        super()
        .get_data_frame(
            id=id,
            auth_args=auth_args,
            query_overrides=query_overrides,
            **kw_args,
        )
        .to_dict("records")[0]
    )
def get_data_frame(
    cls,
    # Query parameters
    variant_set_ids: List[str] = [],
    include: List[GenomicVariantInclude] = [],
    gene: List[str] = [],
    interpretation: List[str] = [],
    effect: List[CopyNumberStatus] = [],
    in_ckb: Optional[bool] = None,
    # Test parameters
    patient_id: Optional[str] = None,
    test_status: Optional[GenomicTestStatus] = GenomicTestStatus.ACTIVE,
    # Execution parameters
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    **kw_args,
):
    """Execute a request for genomic copy number variants

    ## Parameters

    Query: `phc.easy.omics.options.genomic_copy_number_variant.GenomicCopyNumberVariantOptions`

    Execution: `phc.easy.query.Query.execute_paging_api`

    Expansion: `phc.easy.frame.Frame.expand`
    """
    args = cls._get_current_args(inspect.currentframe(), locals())

    return super().get_data_frame(
        test_type=GenomicTestType.COPY_NUMBER_VARIANT,
        **{**kw_args, **args},
    )
def get_data_frame(
    cls,
    # Query parameters
    variant_set_ids: List[str] = [],
    gene: List[str] = [],
    effect: List[StructuralType] = [],
    interpretation: List[str] = [],
    in_frame: List[InFrame] = [],
    in_ckb: Optional[bool] = None,
    include: List[GenomicVariantInclude] = [],
    # Execution parameters
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    **kw_args,
):
    """Execute a request for genomic structural variants

    ## Parameters

    Query: `phc.easy.omics.options.genomic_structural_variant.GenomicStructuralVariantOptions`

    Execution: `phc.easy.query.Query.execute_paging_api`

    Expansion: `phc.easy.frame.Frame.expand`
    """
    args = cls._get_current_args(inspect.currentframe(), locals())

    return super().get_data_frame(
        test_type=GenomicTestType.STRUCTURAL_VARIANT,
        **{**kw_args, **args},
    )
def get_data_frame(
    cls,
    # Query parameters
    variant_set_ids: List[str] = [],
    include: List[GenomicVariantInclude] = [],
    gene: List[str] = [],
    expression: Optional[str] = None,
    outlier_std_dev: Optional[str] = None,
    in_ckb: Optional[bool] = None,
    order_by: Optional[str] = None,
    # Execution parameters
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    **kw_args,
):
    """Execute a request for genomic expression

    ## Parameters

    Query: `phc.easy.omics.options.genomic_expression.GenomicExpressionOptions`

    Execution: `phc.easy.query.Query.execute_paging_api`

    Expansion: `phc.easy.frame.Frame.expand`
    """
    args = cls._get_current_args(inspect.currentframe(), locals())

    return super().get_data_frame(
        test_type=GenomicTestType.EXPRESSION,
        **{**kw_args, **args},
    )
def get_data_frame(
    cls,
    document_id: Optional[str] = None,
    document_ids: List[str] = [],
    all_results=False,
    auth_args: Auth = Auth.shared(),
    query_overrides={},
    **kw_args,
):
    query_overrides = pipe(
        query_overrides,
        term_adder({"meta.tag.code.keyword": "PrecisionOCR Service"}),
        foreign_ids_adder(
            foreign_id=document_id,
            foreign_ids=document_ids,
            foreign_key="relatesTo.targetReference.reference",
            foreign_id_prefixes=["DocumentReference/"],
        ),
    )

    frame = super().get_data_frame(
        all_results=all_results,
        auth_args=auth_args,
        query_overrides=query_overrides,
        **{"ignore_cache": True, **kw_args},
    )

    if PAGE_NUMBER_COLUMN in frame.columns:
        frame = frame.astype({PAGE_NUMBER_COLUMN: "int"})

    if document_id is not None and PAGE_NUMBER_COLUMN in frame.columns:
        return frame.sort_values(PAGE_NUMBER_COLUMN)

    return frame
def get_data_frame(
    cls,
    all_results: bool = False,
    raw: bool = False,
    page_size: Union[int, None] = None,
    max_pages: Union[int, None] = None,
    query_overrides: dict = {},
    auth_args=Auth.shared(),
    ignore_cache: bool = False,
    expand_args: dict = {},
    log: bool = False,
    id: Optional[str] = None,
    ids: List[str] = [],
    # Codes
    code: Optional[Union[str, List[str]]] = None,
    display: Optional[Union[str, List[str]]] = None,
    system: Optional[Union[str, List[str]]] = None,
    code_fields: List[str] = [],
):
    """Retrieve records

    Attributes
    ----------
    all_results : bool = False
        Retrieve sample of results (10) or entire set of records

    raw : bool = False
        If raw, then values will not be expanded (useful for manual
        inspection if something goes wrong)

    page_size : int
        The number of records to fetch per page

    max_pages : int
        The number of pages to retrieve (useful if working with tons of records)

    query_overrides : dict = {}
        Override any part of the elasticsearch FHIR query

    auth_args : Any
        The authentication to use for the account and project (defaults to shared)

    ignore_cache : bool = False
        Bypass the caching system that auto-saves results to a CSV file.
        Caching only occurs when all results are being retrieved.

    expand_args : Any
        Additional arguments passed to phc.Frame.expand

    log : bool = False
        Whether to log some diagnostic statements for debugging

    id : None or str = None
        Find records for a given id

    ids : List[str]
        Find records for given ids

    code : str | List[str]
        Adds where clause for code value(s)

    display : str | List[str]
        Adds where clause for code display value(s)

    system : str | List[str]
        Adds where clause for code system value(s)

    code_fields : List[str]
        A list of paths to find FHIR codes in (default: codes for the given entity)

    Examples
    --------
    >>> import phc.easy as phc
    >>> phc.Auth.set({'account': '<your-account-name>'})
    >>> phc.Project.set_current('My Project Name')
    >>>
    >>> phc.Observation.get_data_frame(patient_id='<patient-id>')
    >>>
    >>> phc.Goal.get_data_frame(patient_id='<patient-id>')
    """
    query = {
        "type": "select",
        "columns": "*",
        "from": [{"table": cls.table_name()}],
    }

    code_fields = [*cls.code_fields(), *code_fields]

    def transform(df: pd.DataFrame):
        return cls.transform_results(df, **expand_args)

    return Query.execute_fhir_dsl_with_options(
        query,
        transform,
        all_results,
        raw,
        query_overrides,
        auth_args,
        ignore_cache,
        page_size=page_size,
        max_pages=max_pages,
        log=log,
        # Codes
        code_fields=code_fields,
        code=code,
        display=display,
        system=system,
        id=id,
        ids=ids,
    )
def get_data_frame(
    cls,
    test_type: GenomicTestType,
    # Query parameters
    variant_set_ids: List[str] = [],
    # Test parameters
    patient_id: Optional[str] = None,
    test_status: Optional[GenomicTestStatus] = GenomicTestStatus.ACTIVE,
    # Execution parameters
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    **kw_args,
):
    """Execute a request for genomic variants

    ## Parameters

    Execution: `phc.easy.query.Query.execute_paging_api`

    Expansion: `phc.easy.frame.Frame.expand`
    """
    test_params = ["patient_id", "status"]

    test_args, args = split_by(
        cls._get_current_args(inspect.currentframe(), locals()),
        left_keys=test_params,
    )

    test_df = cls._get_genomic_tests(
        variant_set_ids=variant_set_ids,
        all_results=all_results,
        test_type=test_type,
        page_size=page_size,
        max_pages=max_pages,
        log=log,
        auth_args=auth_args,
        **test_args,
    )

    args["variant_set_ids"] = variant_set_ids = list(test_df.id)

    if len(variant_set_ids) > MAX_VARIANT_SET_IDS and (
        max_pages or (not all_results and page_size)
    ):
        print(
            "[WARNING]: All result limit parameters are approximate when performing genomic data retrieval."
        )

    get_data_frame = super().get_data_frame

    def perform_batch(ids: List[str], total_thus_far: int):
        # Determine whether to skip this batch
        if (
            # Implement approximation of max_pages
            not all_results
            and max_pages
            and (total_thus_far >= max_pages * (page_size or 100))
        ) or (
            # Use 25 or page_size for a sample (when no max_pages)
            not all_results
            and not max_pages
            and total_thus_far >= (page_size or 25)
        ):
            return pd.DataFrame()

        has_multiple_batches = len(ids) != len(variant_set_ids)

        return get_data_frame(
            **kw_args,
            **{
                **args,
                "variant_set_ids": list(ids),
                "all_results": all_results
                # Scroll through full batches and then honor the max_pages param
                or (has_multiple_batches and max_pages),
            },
        )

    variants = batch_get_frame(
        variant_set_ids, MAX_VARIANT_SET_IDS, perform_batch
    )

    if len(variants) == 0:
        variants["variant_set_id"] = math.nan

    return variants.join(
        test_df.set_index("id"), on="variant_set_id", rsuffix=".set"
    )
def _recursive_execute_composite_aggregations(
    table_name: str,
    key_sources_pairs: List[Tuple[str, List[dict]]],
    batch_size: int = 100,
    progress: Union[tqdm, None] = None,
    query_overrides: dict = {},
    log: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Union[int, None] = None,
    _current_page: int = 1,
    _prev_results: dict = {},
    _after_keys: dict = {},
    **query_kwargs,
):
    aggregation = Query.execute_fhir_dsl(
        {
            "type": "select",
            "columns": [
                {
                    "type": "elasticsearch",
                    "aggregations": {
                        key: {
                            "composite": {
                                "sources": sources,
                                "size": batch_size,
                                **(
                                    {"after": _after_keys[key]}
                                    if key in _after_keys
                                    else {}
                                ),
                            }
                        }
                        for key, sources in key_sources_pairs
                        if (len(_after_keys) == 0) or (key in _after_keys)
                    },
                }
            ],
            "from": [{"table": table_name}],
            **query_overrides,
        },
        auth_args=auth_args,
        log=log,
        **query_kwargs,
    )

    current_results = aggregation.data
    results = FhirAggregation.reduce_composite_results(
        _prev_results, current_results
    )

    if (progress is not None) and (_current_page == 1) and max_pages:
        progress.reset(max_pages)

    if progress is not None:
        # Update by count or pages (if max_pages specified)
        progress.update(
            1
            if max_pages
            else FhirAggregation.count_composite_results(current_results)
        )

    after_keys = FhirAggregation.find_composite_after_keys(
        current_results, batch_size
    )

    if len(after_keys) == 0 or (
        (max_pages is not None) and (_current_page >= max_pages)
    ):
        print(
            f"Retrieved {FhirAggregation.count_composite_results(results)} results"
        )
        return results

    return Query._recursive_execute_composite_aggregations(
        table_name=table_name,
        key_sources_pairs=key_sources_pairs,
        batch_size=batch_size,
        progress=progress,
        query_overrides=query_overrides,
        log=log,
        auth_args=auth_args,
        max_pages=max_pages,
        _current_page=_current_page + 1,
        _prev_results=results,
        _after_keys=after_keys,
        **query_kwargs,
    )
def execute_composite_aggregations(
    table_name: str,
    key_sources_pairs: List[Tuple[str, List[dict]]],
    batch_size: int = 100,
    query_overrides: dict = {},
    log: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Union[int, None] = None,
    **query_kwargs,
):
    """Count records by multiple fields

    Attributes
    ----------
    table_name : str
        The FHIR Search Service table to retrieve from

    key_sources_pairs : List[Tuple[str, List[dict]]]
        Pairs of keys and sources to pull composite results from

        Example Input:
            [
                ("meta.tag", [{"terms": {"field": "meta.tag.system.keyword"}}])
            ]

    batch_size : int
        The size of each page from elasticsearch to use

    query_overrides : dict
        Parts of the FSS query to override
        (Note that passing certain values can cause the method to error out)

        Example aggregation query executed (can use log=True to inspect):
        {
            "type": "select",
            "columns": [{
                "type": "elasticsearch",
                "aggregations": {
                    "results": {
                        "composite": {
                            "sources": [{
                                "meta.tag": {
                                    "terms": {
                                        "field": "meta.tag.system.keyword"
                                    }
                                }
                            }],
                            "size": 100,
                        }
                    }
                },
            }],
            "from": [{"table": "observation"}],
        }

    auth_args : Auth, dict
        Additional arguments for authentication

    log : bool = False
        Whether to log the elasticsearch query sent to the server

    max_pages : int
        The number of pages to retrieve (useful if working with tons of records)

    query_kwargs : dict
        Arguments to pass to build_query such as patient_id, patient_ids, and
        patient_key. See :func:`~phc.easy.query.fhir_dsl_query.build_query`.

    Examples
    --------
    >>> import phc.easy as phc
    >>> phc.Auth.set({ 'account': '<your-account-name>' })
    >>> phc.Project.set_current('My Project Name')
    >>> phc.Query.execute_composite_aggregations(
        table_name="observation",
        key_sources_pairs=[
            ("meta.tag", [
                {"code": {"terms": {"field": "meta.tag.code.keyword"}}},
            ]),
            ("code.coding", [
                {"display": {"terms": {"field": "code.coding.display.keyword"}}}
            ]),
        ]
    )
    """
    if len(key_sources_pairs) == 0:
        raise ValueError("No aggregate composite terms specified.")

    return with_progress(
        tqdm,
        lambda progress: Query._recursive_execute_composite_aggregations(
            table_name=table_name,
            key_sources_pairs=key_sources_pairs,
            batch_size=batch_size,
            progress=progress,
            log=log,
            auth_args=auth_args,
            query_overrides=query_overrides,
            max_pages=max_pages,
            **query_kwargs,
        ),
    )
def execute_paging_api(
    path: str,
    params: dict = {},
    http_verb: str = "GET",
    transform: Callable[[pd.DataFrame], pd.DataFrame] = identity,
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    raw: bool = False,
    ignore_cache: bool = False,
    show_progress: bool = True,
    progress: Optional[tqdm] = None,
    item_key: str = "items",
    try_count: bool = True,
):
    """Execute an API query that pages through results

    See https://docs.us.lifeomic.com/api/?shell#lifeomic-core-api-genomics
    for example

    Attributes
    ----------
    path : str
        The API path to hit
        (Special tokens: `:project_id`)

    params : dict
        The parameters to include with request

    http_verb : str
        The HTTP method to use

    all_results : bool = False
        Retrieve sample of results (25) or entire set of records

    auth_args : Auth, dict
        Additional arguments for authentication

    max_pages : int
        The number of pages to retrieve (useful if working with tons of records)

    page_size : int
        The number of records to fetch per page

    log : bool = False
        Whether to log some diagnostic statements for debugging

    progress : Optional[tqdm] = None
        Override the given progress indicator

    item_key : str
        The key to find the results underneath (usually "items" but not always)

    try_count : bool
        Whether to try and send a "count" param to update the progress bar

    Examples
    --------
    >>> import phc.easy as phc
    >>> phc.Auth.set({ 'account': '<your-account-name>' })
    >>> phc.Project.set_current('My Project Name')
    >>> phc.Query.execute_paging_api(
            "genomics/projects/:project_id/tests",
            params={"patientId": "<patient-uuid>"}
        )
    """
    auth = Auth(auth_args)

    params = clean_params(params)

    # Do not pull project_id if not in URL (which throws error if project not selected)
    if "project_id" in path:
        path = path.replace(":project_id", auth.project_id)

    query = {"path": path, "method": http_verb, "params": params}

    if all_results and page_size is None:
        # Default to 100 if not provided but getting all results
        page_size = 100

    if log:
        print(json.dumps(query, indent=4))

    use_cache = (
        (not ignore_cache)
        and (not raw)
        and all_results
        and (max_pages is None)
    )

    if use_cache and APICache.does_cache_for_query_exist(query):
        return APICache.load_cache_for_query(query)

    callback = (
        APICache.build_cache_callback(query, transform, nested_key=None)
        if use_cache
        else None
    )

    results = with_progress(
        lambda: (progress if progress is not None else tqdm())
        if show_progress
        else None,
        lambda progress: recursive_paging_api_call(
            path,
            params=params,
            http_verb=http_verb,
            callback=callback,
            scroll=all_results or (max_pages is not None),
            max_pages=max_pages,
            page_size=page_size,
            log=log,
            auth_args=auth_args,
            progress=progress,
            item_key=item_key,
            try_count=try_count,
        ),
    )

    df = pd.DataFrame(results)

    if raw:
        return df

    return transform(df)
def get(auth_args: Auth = Auth.shared()):
    auth = Auth(auth_args)
    client = BaseClient(auth.session())

    return client._api_call(
        f"ocr/config/{auth.project_id}", http_verb="GET"
    ).data
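# A minimal usage sketch, assuming this staticmethod lives on an OCR config
# helper class (called `Config` here purely for illustration): fetch the OCR
# configuration for the currently selected project.
config = Config.get()
print(config)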
def recursive_paging_api_call(
    path: str,
    params: dict = {},
    http_verb: str = "GET",
    scroll: bool = False,
    progress: Optional[tqdm] = None,
    auth_args: Optional[Auth] = Auth.shared(),
    callback: Union[Callable[[Any, bool], None], None] = None,
    max_pages: Optional[int] = None,
    page_size: Optional[int] = None,
    log: bool = False,
    _current_page: int = 1,
    _prev_results: List[dict] = [],
    _next_page_token: Optional[str] = None,
    _count: Optional[Union[float, int]] = None,
):
    auth = Auth(auth_args)
    client = BaseClient(auth.session())

    if _next_page_token:
        params = {**params, "nextPageToken": _next_page_token}

    if page_size:
        params = {**params, "pageSize": page_size}

    # NOTE: Parallelism is kept with execute_fhir_dsl to unify the API calls
    if scroll is False:
        max_pages = 1

    # Compute count and add to progress
    if _count is None and len(_prev_results) == 0:
        count_response = client._api_call(
            path,
            http_verb=http_verb,
            # Use minimum pageSize in case this endpoint doesn't support count
            params={**params, "include": "count", "pageSize": 1},
        )

        _count = count_response.get("count")

        # Count appears to only go up to 999
        if _count == 999:
            print(f"Results are {_count}+.")
            _count = None

        if _count and (progress is not None):
            progress.reset(_count)

    response = client._api_call(path, http_verb=http_verb, params=params)

    current_results = response.data.get("items", [])

    if progress is not None:
        progress.update(len(current_results))

    is_last_batch = (
        (scroll is False)
        or ((max_pages is not None) and (_current_page >= max_pages))
        # Using the next link is the only completely reliable way to tell if a
        # next page exists
        or (response.data.get("links", {}).get("next") is None)
    )

    results = [] if callback else [*_prev_results, *current_results]

    # Sometimes the count doesn't match the results. We make it sync up if the
    # count doesn't match but we got all results.
    # TODO: Remove this when API fixed
    if (
        (progress is not None)
        and scroll
        and is_last_batch
        and (progress.total != progress.n)
    ):
        count = progress.n
        progress.reset(count)
        progress.update(count)

    if callback and not is_last_batch:
        callback(current_results, False)
    elif callback and is_last_batch:
        return callback(current_results, True)
    elif is_last_batch:
        if progress is not None:
            progress.close()

        # Because count is often wrong, we'll skip the logging here
        # TODO: Uncomment this when API fixed
        # print(
        #     f"Retrieved {len(results)}{f'/{_count}' if _count else ''} results"
        # )
        return results

    return recursive_paging_api_call(
        path,
        params=params,
        http_verb=http_verb,
        progress=progress,
        auth_args=auth_args,
        callback=callback,
        max_pages=max_pages,
        page_size=page_size,
        log=log,
        scroll=scroll,
        _current_page=_current_page + 1,
        _prev_results=results,
        _next_page_token=get_next_page_token(
            response.data.get("links", {}).get("next", "")
        ),
        _count=_count,
    )
def execute_fhir_dsl(
    query: dict,
    all_results: bool = False,
    auth_args: Auth = Auth.shared(),
    callback: Union[Callable[[Any, bool], None], None] = None,
    max_pages: Union[int, None] = None,
    log: bool = False,
    **query_kwargs,
):
    """Execute a FHIR query with the DSL

    See https://docs.us.lifeomic.com/development/fhir-service/dsl/

    Attributes
    ----------
    query : dict
        The FHIR query to run (is a superset of elasticsearch)

    all_results : bool
        Return all results by scrolling through multiple pages of data
        (Limit is ignored if provided)

    auth_args : Auth, dict
        Additional arguments for authentication

    callback : Callable[[Any, bool], None] (optional)
        A progress function that is invoked for each batch. When the second
        argument passed is true, then the result of the callback function is
        used as the return value. This is useful if writing results out to a
        file and then returning the completed result from that file.

        Example:
            def handle_batch(batch, is_finished):
                print(len(batch))

                if is_finished:
                    return "batch finished"

    max_pages : int
        The number of pages to retrieve (useful if working with tons of records)

    log : bool = False
        Whether to log the elasticsearch query sent to the server

    query_kwargs : dict
        Arguments to pass to build_query such as patient_id, patient_ids, and
        patient_key. (See phc.easy.query.fhir_dsl_query.build_query)

    Examples
    --------
    >>> import phc.easy as phc
    >>> phc.Auth.set({ 'account': '<your-account-name>' })
    >>> phc.Project.set_current('My Project Name')
    >>> phc.Query.execute_fhir_dsl({
        "type": "select",
        "columns": "*",
        "from": [
            {"table": "patient"}
        ],
    }, all_results=True)
    """
    query = build_query(query, **query_kwargs)

    if log:
        print(json.dumps(query, indent=4))

    if FhirAggregation.is_aggregation_query(query):
        response = execute_single_fhir_dsl(query, auth_args=auth_args)
        return FhirAggregation.from_response(response)

    if all_results:
        return with_progress(
            lambda: tqdm(total=MAX_RESULT_SIZE),
            lambda progress: recursive_execute_fhir_dsl(
                {
                    "limit": [
                        {"type": "number", "value": 0},
                        # Make window size smaller than maximum to reduce
                        # pressure on API
                        {"type": "number", "value": DEFAULT_SCROLL_SIZE},
                    ],
                    **query,
                },
                scroll=all_results,
                progress=progress,
                callback=callback,
                auth_args=auth_args,
                max_pages=max_pages,
            ),
        )

    return recursive_execute_fhir_dsl(
        query,
        scroll=all_results,
        callback=callback,
        auth_args=auth_args,
        max_pages=max_pages,
    )
def get_count_by_field(
    table_name: str,
    field: str,
    batch_size: int = 1000,
    query_overrides: dict = {},
    log: bool = False,
    auth_args: Auth = Auth.shared(),
    **query_kwargs,
):
    """Count records by a given field

    Attributes
    ----------
    table_name : str
        The FHIR Search Service table to retrieve from

    field : str
        The field name to count the values of (e.g. "subject.reference")

    batch_size : int
        The size of each page from elasticsearch to use

    query_overrides : dict
        Parts of the FSS query to override
        (Note that passing certain values can cause the method to error out)

        The aggregation query is similar to this:
        {
            "type": "select",
            "columns": [{
                "type": "elasticsearch",
                "aggregations": {
                    "results": {
                        "composite": {
                            "sources": [{
                                "value": {
                                    "terms": {
                                        "field": "gender.keyword"
                                    }
                                }
                            }],
                            "size": 100,
                        }
                    }
                },
            }],
            "from": [{"table": "patient"}],
        }

    auth_args : Auth, dict
        Additional arguments for authentication

    log : bool = False
        Whether to log the elasticsearch query sent to the server

    query_kwargs : dict
        Arguments to pass to build_query such as patient_id, patient_ids, and
        patient_key. (See phc.easy.query.fhir_dsl_query.build_query)

    Examples
    --------
    >>> import phc.easy as phc
    >>> phc.Auth.set({ 'account': '<your-account-name>' })
    >>> phc.Project.set_current('My Project Name')
    >>> phc.Query.get_count_by_field(
        table_name="patient",
        field="gender"
    )
    """
    data = Query.execute_composite_aggregations(
        table_name=table_name,
        key_sources_pairs=[
            (
                "results",
                [{"value": {"terms": {"field": f"{field}.keyword"}}}],
            )
        ],
        batch_size=batch_size,
        log=log,
        auth_args=auth_args,
        query_overrides=query_overrides,
        **query_kwargs,
    )

    return pd.DataFrame(
        [
            {field: r["key"]["value"], "doc_count": r["doc_count"]}
            for r in data["results"]["buckets"]
        ]
    )
def test_getting_genomic_tests(get_data_frame):
    GenomicVariant._get_genomic_tests(
        [], max_pages=None, all_results=False, auth_args=Auth(), log=False
    )

    get_data_frame.assert_called_once()
def test_creating_auth_from_another_auth_object():
    auth = Auth({"account": "demo"})
    auth1 = Auth(auth)

    assert auth1.account == "demo"