def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        site,
        sub_type,
        dimensions_list,
        path,
        endpoint_config,
        api_method,
        pagination,
        static_params,
        bookmark_field=None,
        data_key=None,
        body_params=None,
        id_fields=None,
        resource_name=None):

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = None
    max_bookmark_value = None
    last_datetime = get_bookmark(state, stream_name, site, sub_type, start_date)
    max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until a batch returns fewer records than the limit (last page).
    offset = 0  # Starting offset value for each batch API call
    limit = endpoint_config.get(
        'row_limit', 1000)  # Batch size; Number of records per API call
    total_records = 0
    batch_count = limit
    page = 1
    path_params = {'siteUrl': site}

    while limit == batch_count:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # adds in endpoint specific, sort, filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # adds in endpoint specific, sort, filter params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.
            format(stream_name, site, sub_type, offset))

        # Squash params to query-string params
        querystring = None
        if params.items():
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])

        LOGGER.info('URL for Stream: {}, Site: {}, Search type: {}'.format(
            stream_name, site, sub_type))
        if body and not body == {}:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        if body:
            path_params['body'] = body

        data = client.get(method_name=api_method,
                          resource_name=resource_name,
                          params=path_params)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()

        if not data or data is None or data == {}:
            LOGGER.info('xxx NO DATA xxx')
            if bookmark_field:
                write_bookmark(state, stream_name, site, sub_type,
                               max_bookmark_value)
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict

        if data_key in data:
            transformed_data = transform_json(data, stream_name, data_key,
                                              site, sub_type,
                                              dimensions_list)[data_key]
        else:
            LOGGER.info('Number of raw data records: 0')

        if not transformed_data or transformed_data is None:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')
            if bookmark_field:
                write_bookmark(state, stream_name, site, sub_type,
                               max_bookmark_value)
            return 0  # No data results

        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    primary_keys_only = {
                        id_field: record.get(id_field)
                        for id_field in id_fields
                    }
                    raise ValueError(
                        'Missing key {} in record with primary keys {}'.format(
                            key, primary_keys_only))

        batch_count = len(transformed_data)

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {}, Page: {}, Batch records: {} to {}'
            .format(stream_name, site, sub_type, page, offset, to_rec))

        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit
        total_records = total_records + batch_count
        page = page + 1

    # Update the state with the max_bookmark_value for the stream, site, sub_type
    # Reference: https://developers.google.com/webmaster-tools/search-console-api-original/v3/searchanalytics/query
    # NOTE: Results are sorted by click count descending.
    #   If two rows have the same click count, they are sorted in an arbitrary way.
    #   Records are NOT sorted in DATE order.
    # THEREFORE: State is updated after ALL pages of data for stream, site, sub_type, date window
    if bookmark_field:
        write_bookmark(state, stream_name, site, sub_type, max_bookmark_value)

    # Return total_records across all batches
    return total_records
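

# Illustrative usage sketch (an assumption, not part of the original module): one way a
# caller might invoke the sync_endpoint variant above for a Search Analytics stream.
# Every argument value here is a hypothetical example chosen to match the signature;
# the real stream settings, dimensions, and id_fields come from the tap's endpoint
# configuration, and the client wrapper is whatever this variant was written against.
def _example_sync_performance_report(client, catalog, state):
    return sync_endpoint(
        client=client,
        catalog=catalog,
        state=state,
        start_date='2024-01-01T00:00:00Z',
        stream_name='performance_report_page',
        site='https://www.example.com/',
        sub_type='web',
        dimensions_list=['date', 'page'],
        path='searchAnalytics/query',          # not used by this variant; kept for signature parity
        endpoint_config={'row_limit': 10000},  # row_limit drives the batch size above
        api_method='query',                    # method name resolved by the client wrapper
        pagination='body',                     # startRow/rowLimit are sent in the request body
        static_params={},
        bookmark_field='date',
        data_key='rows',
        body_params={
            'startDate': '2024-01-01',
            'endDate': '2024-01-31',
            'dimensions': ['date', 'page'],
            'searchType': 'web'
        },
        id_fields=['site_url', 'search_type', 'date', 'page'],
        resource_name='searchanalytics')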
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        site,
        sub_type,
        dimensions_list,
        path,
        endpoint_config,
        api_method,
        pagination,
        static_params,
        bookmark_field=None,
        bookmark_type=None,
        data_key=None,
        body_params=None,
        id_fields=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, site, sub_type, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, site, sub_type,
                                     start_date)
        max_bookmark_value = last_datetime

    # Pagination: loop thru all pages of data
    # Pagination types: none, body, params
    # Each page has an offset (starting value) and a limit (batch size, number of records)
    # Increase the "offset" by the "limit" for each batch.
    # Continue until the "offset" exceeds the total_records.
    offset = 0  # Starting offset value for each batch API call
    limit = 25000  # Batch size; Number of records per API call
    total_records = limit  # Initialize total; set to actual total on first API call

    while offset <= total_records:
        if pagination == 'body':
            body = {
                'startRow': offset,
                'rowLimit': limit,
                **body_params  # adds in endpoint specific, sort, filter body params
            }
            params = static_params
        elif pagination == 'params':
            params = {
                'startRow': offset,
                'rowLimit': limit,
                **static_params  # adds in endpoint specific, sort, filter params
            }
            body = body_params
        else:
            params = static_params
            body = body_params

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Batch Sync start, Offset: {}'.
            format(stream_name, site, sub_type, offset))

        # Squash params to query-string params
        querystring = None
        if params.items():
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])

        LOGGER.info('URL for Stream: {}, Site: {} ({}): {}/{}{}'.format(
            stream_name, site, api_method, BASE_URL, path,
            '?{}'.format(querystring) if querystring else ''))
        if body and not body == {}:
            LOGGER.info('body = {}'.format(body))

        # API request data, endpoint = stream_name passed to client for metrics logging
        data = {}
        if api_method == 'GET':
            data = client.get(path=path,
                              params=querystring,
                              endpoint=stream_name)
        elif api_method == 'POST':
            data = client.post(path=path,
                               params=querystring,
                               endpoint=stream_name,
                               data=json.dumps(body))

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()

        if not data or data is None or data == {}:
            LOGGER.info('xxx NO DATA xxx')
            return 0  # No data results

        # Transform data with transform_json from transform.py
        transformed_data = []  # initialize the record list

        # Sites endpoint returns a single record dictionary (not a list)
        if stream_name == 'sites':
            data_list = []
            data_list.append(data)
            data_dict = {}
            data_dict[data_key] = data_list
            data = data_dict
        # LOGGER.info('data = {}'.format(data))  # TESTING, comment out

        if data_key in data:
            LOGGER.info('Number of raw data records: {}'.format(
                len(data[data_key])))
            transformed_data = transform_json(data, stream_name, data_key,
                                              site, sub_type,
                                              dimensions_list)[data_key]
            LOGGER.info('Number of transformed_data records: {}'.format(
                len(transformed_data)))
        else:
            LOGGER.info('Number of raw data records: 0')
        # LOGGER.info('transformed_data = {}'.format(transformed_data))  # TESTING, comment out

        if not transformed_data or transformed_data is None:
            LOGGER.info('xxx NO TRANSFORMED DATA xxx')
            return 0  # No data results

        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    LOGGER.info('xxx Missing key {} in record: {}'.format(
                        key, record))

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer)

        # set total_records for pagination
        total_records = offset + len(transformed_data)
        LOGGER.info('total_records: {}, offset: {}, length: {}'.format(
            total_records, offset, len(transformed_data)))

        # Update the state with the max_bookmark_value for the stream, site, sub_type
        if bookmark_field:
            write_bookmark(state, stream_name, site, sub_type,
                           max_bookmark_value)

        # to_rec: to record; ending record for the batch
        to_rec = offset + limit
        if to_rec > total_records:
            to_rec = total_records

        LOGGER.info(
            'Stream: {}, Site: {}, Type: {} - Synced batch records - {} to {}'.
            format(stream_name, site, sub_type, offset, to_rec))

        # Pagination: increment the offset by the limit (batch-size)
        offset = offset + limit

    # Return total_records across all batches
    return total_records
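

# Illustrative usage sketch (an assumption, not from the original source): a hypothetical
# call to the REST-style sync_endpoint variant above for the 'sites' stream, which would
# issue a plain GET with no pagination and write no bookmark. All argument values are
# examples only; the actual path, data_key, and id_fields belong to the tap's stream
# configuration.
def _example_sync_sites(client, catalog, state):
    return sync_endpoint(
        client=client,
        catalog=catalog,
        state=state,
        start_date='2024-01-01T00:00:00Z',
        stream_name='sites',
        site='https://www.example.com/',
        sub_type='self',
        dimensions_list=[],
        path='sites/https%3A%2F%2Fwww.example.com%2F',  # assumed URL-encoded site path
        endpoint_config={},
        api_method='GET',
        pagination='none',        # neither body nor params pagination is applied
        static_params={},
        bookmark_field=None,      # full-table stream: no bookmark is written
        bookmark_type=None,
        data_key='site_entry',    # assumed key used to wrap the single-record response
        body_params={},
        id_fields=['site_url'])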