def request_from_api(api_params, curr_index):
    """
    Make a POST API request and return response (if valid).

    :param api_params: api params set in yaml config; this function reads
        ENDPOINT, BATCH_SIZE, EXPAND_FIELD_GROUPS and IO_MODE
    :param curr_index: current API poll start position
    :return: response object when the status code equals requests.codes.ok;
        on any failure the collected messages are handed to has_fatal_error
        (defined elsewhere; presumably it aborts or raises -- confirm), and
        None is returned if that helper ever returns
    """
    request_params = {
        'from': curr_index,
        'size': api_params['BATCH_SIZE'],
        'expand': api_params['EXPAND_FIELD_GROUPS']
    }

    try:
        # retrieve a "page" (batch) of case objects
        res = requests.post(url=api_params['ENDPOINT'], data=request_params)
    except requests.exceptions.MissingSchema as e:
        # No response object exists on this path, so the failure must be
        # reported here instead of through the response.
        has_fatal_error(
            [str(e) + '(Hint: check the ENDPOINT value supplied in yaml config.)'],
            requests.exceptions.MissingSchema)
        return None

    # return response if request was successful
    if res.status_code == requests.codes.ok:
        return res

    err_list = ['API request returned status code {}.'.format(str(res.status_code))]

    if api_params['IO_MODE'] == 'a':
        # In append mode the output already holds earlier pages, so tell the
        # user where to resume from.
        err_list.append(
            'Scripts is being run in "append" mode. '
            'To resume without data loss or duplication, set START_INDEX = {} in your YAML config file.'
            .format(curr_index))

    # Pass the exception class (as other callers in this file do) rather than
    # calling res.raise_for_status() in the argument list, which would raise
    # before the messages above could be reported.
    has_fatal_error(err_list, requests.exceptions.HTTPError)
    return None
def retrieve_and_output_cases(batch_size, endpoint, expand_fields):
    """
    Page through the API and gather the names of every field seen on a case.

    Each response is expected to carry a 'data' object holding 'hits' (the
    case records) and 'pagination' metadata. Paging stops once the reported
    page number equals the reported page count. The gathered field names are
    kept in a local set; nothing is returned in the code visible here.

    :param batch_size: number of records requested per API call
    :param endpoint: API endpoint url
    :param expand_fields: value sent as the 'expand' request parameter
    """
    start_time = time.time()  # for benchmarking
    total_cases_count = 0
    keys = set()
    curr_index = 0

    while True:
        res = request_from_api(curr_index, batch_size, expand_fields, endpoint)
        payload = res.json()['data']
        cases = payload['hits']

        # Currently, if response doesn't contain this metadata,
        # it indicates an invalid response or request.
        if 'pagination' not in payload:
            has_fatal_error("'pagination' not found in API response, exiting.")

        page_info = payload['pagination']
        batch_record_count = page_info['count']
        total_cases_count = page_info['total']
        curr_page = page_info['page']
        last_page = page_info['pages']

        for case in cases:
            if 'days_to_index' in case:
                print("Found days_to_index!\n{}".format(case))
            # record every field name present on this case
            keys.update(case)

        print("API call {}".format(curr_page))
        curr_index += batch_record_count

        if curr_page == last_page:
            break
def request_from_api(start_index, batch_size, expand_fields, endpoint):
    """
    POST one page request to the API and return the response when it succeeds.

    :param start_index: value sent as the 'from' request parameter
    :param batch_size: value sent as the 'size' request parameter
    :param expand_fields: value sent as the 'expand' request parameter
    :param endpoint: url the request is posted to
    :return: response object if its status code equals requests.codes.ok;
        otherwise has_fatal_error is called with the status code
    """
    payload = {
        'from': start_index,
        'size': batch_size,
        'expand': expand_fields
    }

    # retrieve a "page" (batch) of case objects
    res = requests.post(url=endpoint, data=payload)

    if res.status_code != requests.codes.ok:
        has_fatal_error("API request returned result code {}, exiting.".format(
            res.status_code))
        return None

    # request was successful
    return res
def retrieve_and_output_cases(api_params, bq_params, data_fp):
    """
    Retrieves case records from API and outputs them to a JSONL file, which is later
    used to populate the clinical data BQ table.

    :param api_params: API and file output params, from YAML config; this function
        reads IO_MODE, START_INDEX, EXCLUDE_FIELDS and MAX_PAGES
    :param bq_params: BQ params, from YAML config (not read in this function)
    :param data_fp: absolute path to data output file
    """
    start_time = time.time()  # for benchmarking
    total_cases_count = 0

    # Parse the comma-separated exclusion list once, before the output file is
    # opened, rather than re-splitting it for every record. Surrounding
    # whitespace is stripped so that "a, b" excludes both fields; an empty or
    # missing value means nothing is excluded.
    raw_exclude = api_params['EXCLUDE_FIELDS']
    if raw_exclude:
        exclude_fields = [name.strip() for name in raw_exclude.split(',') if name.strip()]
    else:
        exclude_fields = []

    with open(data_fp, api_params['IO_MODE']) as json_output_file:
        curr_index = api_params['START_INDEX']
        is_last_page = False

        while not is_last_page:
            res = request_from_api(api_params, curr_index)
            res_json = res.json()['data']
            cases_json = res_json['hits']

            # Currently, if response doesn't contain this metadata, it indicates
            # an invalid response or request.
            if 'pagination' not in res_json:
                has_fatal_error("'pagination' key not found in response json, exiting.", KeyError)

            pagination = res_json['pagination']
            batch_record_count = pagination['count']
            total_cases_count = pagination['total']
            curr_page = pagination['page']
            last_page = pagination['pages']

            for case in cases_json:
                # drop excluded fields in place; absent fields are ignored
                for field in exclude_fields:
                    case.pop(field, None)

                no_list_value_case = arrays_to_str_list(case)

                # writing in jsonlines format, as required by BQ
                json.dump(obj=no_list_value_case, fp=json_output_file)
                json_output_file.write('\n')

            # Stop on the final page, or once MAX_PAGES is reached when it is
            # set to a truthy value.
            if curr_page == last_page or (api_params['MAX_PAGES'] and curr_page == api_params['MAX_PAGES']):
                is_last_page = True

            print("Inserted page {} of {} ({} records) into jsonlines file"
                  .format(curr_page, last_page, batch_record_count))
            curr_index += batch_record_count
def is_valid_idx_param(yaml_param):
    """
    Verifies that index-type params provided are non-negative integer values.

    The value is looked up in ``api_params``, which is not a parameter of this
    function and must be available from the surrounding scope.

    :param yaml_param: name of the api_params entry to verify
    :return: True if the value converts to a non-negative int. Invalid values
        are reported through has_fatal_error (defined elsewhere; presumably it
        aborts or raises -- confirm); False is returned if that helper returns.
    """
    supplied = api_params[yaml_param]

    # Only the conversion lives inside the try block, so an exception raised by
    # has_fatal_error itself can never be mistaken for a conversion failure.
    try:
        is_negative = int(supplied) < 0
    except (TypeError, ValueError) as e:
        # TypeError: value of an unconvertible type (e.g. None);
        # ValueError: a string that is not an integer literal.
        has_fatal_error(
            ['{} in yaml config should be of type int, not type {}).'
             .format(yaml_param, type(supplied)),
             str(e)],
            type(e))
        return False

    if is_negative:
        has_fatal_error(
            ['Invalid value for {} in yaml config (supplied: {}).'
             .format(yaml_param, supplied),
             'Value should be a non-negative integer.'],
            ValueError)
        return False

    return True
def main(args):
    """
    Script entry point: load and validate the YAML config, then run whichever
    pipeline steps the config lists.

    :param args: command-line argument list; args[0] is used as the script name
        in the usage message and args[1] is the path to the YAML config file
    """
    if len(args) != 2:
        # The format call has to sit outside the string literal, otherwise the
        # placeholder is never filled in.
        has_fatal_error('Usage : {} <configuration_yaml>'.format(args[0]), ValueError)

    # Load the YAML config file
    with open(args[1], mode='r') as yaml_file:
        try:
            api_params, bq_params, steps = load_config(yaml_file, YAML_HEADERS)
        except ValueError as e:
            has_fatal_error(str(e), ValueError)

    # Validate YAML config params
    validate_params(api_params, bq_params)

    data_fp = construct_filepath(api_params)
    schema = None

    if 'retrieve_and_output_cases' in steps:
        # Hits the GDC api endpoint, outputs data to jsonl file
        # (newline-delineated json, required by BQ)
        print('Starting GDC API calls!')
        retrieve_and_output_cases(api_params, bq_params, data_fp)

    if 'create_bq_schema_obj' in steps:
        # Creates a BQ schema python object consisting of nested SchemaField objects
        print('Creating BQ schema object!')
        schema = create_bq_schema(api_params, data_fp)

    if 'build_bq_table' in steps:
        # Creates and populates BQ table; the schema is only available when
        # the 'create_bq_schema_obj' step ran and produced a truthy result.
        if not schema:
            has_fatal_error('Empty SchemaField object', UnboundLocalError)
        print('Building BQ Table!')

        # don't want the entire fp for 2nd param, just the file name
        create_and_load_table(bq_params, api_params['DATA_OUTPUT_FILE'], schema)
def validate_params(api_params, bq_params):
    """
    Validates yaml parameters before beginning to execute the script.

    This checks for reasonable (though not necessarily correct) api request param
    types and values. It confirms all params found in the template yaml config are
    also included in the supplied yaml config.

    :param api_params: dict of api and file related params from user-provided yaml config
    :param bq_params: dict of bq related params from user-provided yaml config
    """

    def is_valid_idx_param(yaml_param):
        """
        Verifies that index-type params provided are non-negative integer values.

        :param yaml_param: name of the api_params entry to verify
        :return: True if valid; invalid values are reported via has_fatal_error,
            and False is returned if that helper returns
        """
        supplied = api_params[yaml_param]

        # Only the conversion is guarded, so an exception raised by
        # has_fatal_error is never mistaken for a conversion failure.
        try:
            is_negative = int(supplied) < 0
        except (TypeError, ValueError) as e:
            has_fatal_error(
                ['{} in yaml config should be of type int, not type {}).'
                 .format(yaml_param, type(supplied)),
                 str(e)],
                type(e))
            return False

        if is_negative:
            has_fatal_error(
                ['Invalid value for {} in yaml config (supplied: {}).'
                 .format(yaml_param, supplied),
                 'Value should be a non-negative integer.'],
                ValueError)
            return False

        return True

    try:
        if api_params['IS_LOCAL_MODE']:
            yaml_template_path = '../ConfigFiles/ClinicalBQBuild.yaml'
        else:
            home = expanduser('~')
            yaml_template_path = home + '/NextGenETL/ConfigFiles/ClinicalBQBuild.yaml'

        with open(yaml_template_path, mode='r') as yaml_file:
            default_api_params, default_bq_params, _ = load_config(yaml_file, YAML_HEADERS)

        # verify all required params exist in yaml config
        for param in default_api_params:
            if param not in api_params:
                raise KeyError(param)

        for param in default_bq_params:
            if param not in bq_params:
                raise KeyError(param)
    except FileNotFoundError as e:
        # Best effort: without the template the key comparison is skipped.
        print('Default yaml config file not found, unable to compare with supplied yaml config.\n' + str(e))
    except ValueError as e:
        has_fatal_error(str(e), e)
    except KeyError as e:
        has_fatal_error('Missing param from yaml config file.', e)

    # verify that api index-related params are set to non-negative integers.
    # Each check is its own statement: chaining them with `and` would stop at
    # the first falsy result and skip the remaining params.
    batch_size_ok = is_valid_idx_param('BATCH_SIZE')
    is_valid_idx_param('START_INDEX')
    is_valid_idx_param('MAX_PAGES')

    # BATCH_SIZE must also be positive; compare as int so a value of '0' is caught too
    if batch_size_ok and int(api_params['BATCH_SIZE']) == 0:
        has_fatal_error('BATCH_SIZE set to 0 in yaml_config, should be > 0.', ValueError)