def property_is_selected(stream, property_name):
    """Return True when *property_name* in *stream*'s catalog metadata is selected for sync."""
    md_map = metadata.to_map(stream.metadata)
    breadcrumb = ("properties", property_name)
    inclusion = metadata.get(md_map, breadcrumb, "inclusion")
    selected = metadata.get(md_map, breadcrumb, "selected")
    # Fields with no explicit selection default to selected (True).
    return singer.should_sync_field(inclusion, selected, True)
def _get_selected_properties(self, catalog_entry):
    """Return the names of schema properties selected for replication."""
    mdata = metadata.to_map(catalog_entry['metadata'])
    chosen = []
    for name in catalog_entry['schema'].get('properties', {}).keys():
        crumb = ('properties', name)
        # Per-field metadata wins; otherwise fall back to the tap-wide default.
        if singer.should_sync_field(metadata.get(mdata, crumb, 'inclusion'),
                                    metadata.get(mdata, crumb, 'selected'),
                                    self.select_fields_by_default):
            chosen.append(name)
    return chosen
def _get_selected_properties(self, catalog_entry):
    """Return the schema property names that should be synced for this catalog entry."""
    mdata = metadata.to_map(catalog_entry["metadata"])

    def _is_field_selected(field):
        # Honor the field's inclusion/selected metadata; unset selection
        # falls back to the tap-wide select_fields_by_default flag.
        crumb = ("properties", field)
        return singer.should_sync_field(
            metadata.get(mdata, crumb, "inclusion"),
            metadata.get(mdata, crumb, "selected"),
            self.select_fields_by_default,
        )

    properties = catalog_entry["schema"].get("properties", {})
    return [field for field in properties.keys() if _is_field_selected(field)]
def selected_fields(catalog_for_stream):
    """Return the list of schema field names selected for sync.

    Fix: ``mdata.get(('properties', field))`` returns ``None`` when a field
    has no metadata entry, and the subsequent ``field_metadata.get(...)``
    raised ``AttributeError``. Default to ``{}`` instead, matching the other
    selection helpers in this file — such fields then have no
    inclusion/selected metadata and fall through to should_sync_field's
    default behavior.

    :param catalog_for_stream: catalog entry with ``metadata`` and
        ``schema.properties`` attributes.
    :return: list of selected field names, in schema order.
    """
    mdata = metadata.to_map(catalog_for_stream.metadata)
    fields = catalog_for_stream.schema.properties.keys()
    selected_fields_list = list()
    for field in fields:
        # Default to an empty dict so fields without metadata don't crash.
        field_metadata = mdata.get(('properties', field), {})
        if should_sync_field(field_metadata.get('inclusion'),
                             field_metadata.get('selected')):
            selected_fields_list.append(field)
    return selected_fields_list
def property_is_selected(stream, property_name):
    """Detect if field is selected to sync"""
    crumb = ('properties', property_name)
    md_map = metadata.to_map(stream.metadata)
    inclusion = metadata.get(md_map, crumb, 'inclusion')
    selected = metadata.get(md_map, crumb, 'selected')
    # Unset selection metadata defaults to selected (True).
    return singer.should_sync_field(inclusion, selected, True)
def should_sync_column(md_map, field_name):
    """Return True when *field_name* should be replicated (unset metadata defaults to selected)."""
    meta = md_map.get(('properties', field_name), {})
    inclusion, selected = meta.get('inclusion'), meta.get('selected')
    return singer.should_sync_field(inclusion, selected, True)
def sync(client, config, catalog, state):
    """Sync every selected stream for every configured site.

    For each selected stream, loops over the comma-separated ``site_urls``
    from config, then over the stream's sub-types, and syncs data in
    DATE_WINDOW_SIZE-day windows via sync_endpoint(), updating the
    currently-syncing marker in ``state`` as it goes.

    :param client: API client passed through to sync_endpoint.
    :param config: tap config; reads 'start_date' and 'site_urls'.
    :param catalog: singer catalog; selection read via get_selected_streams.
    :param state: singer state dict (bookmarks / currently_syncing).
    """
    start_date = config.get('start_date')
    # Get selected_streams from catalog, based on state last_stream
    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    # Nothing selected: nothing to do.
    if not selected_streams or selected_streams == []:
        return

    # Get current datetime (now_dt_str) for query parameters
    # NOTE(review): now_dt_str is computed but never used below — confirm
    # whether it can be removed.
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    # There is some delay/lag in Google Search Console results reconcilliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        # First replication key (if any) is used as the bookmark field.
        bookmark_field = next(
            iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        # NOTE(review): this value is unconditionally reassigned inside the
        # date-window loop below — confirm this initialization is needed.
        body = endpoint_config.get('body', {})

        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            # "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site[0:9] == 'sc-domain':
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info(
                    ' Sitemaps API does not support domain property urls at this time.'
                )
            else:
                # Not sitemaps and sites = sc-domain
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(
                    stream_name, site))
                site_total = 0
                # URL-encode the site so it is safe inside the request path.
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)
                resource_name = endpoint_config.get('resource_name')

                # Set dimension_list for performance_reports
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = [
                        'date', 'country', 'device', 'page', 'query'
                    ]
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    # NOTE(review): mutates the shared endpoint_config body
                    # dict — later sites reuse the same dimensions.
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(
                    stream_name, dimensions_list))

                # loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window: performance reports resume from
                    # the bookmark (capped by the attribution window); other
                    # streams sync the full start_date..now range at once.
                    if stream_name.startswith('performance_report'):
                        reports_dttm_str = get_bookmark(
                            state, stream_name, site, sub_type, start_date)
                        reports_dttm = strptime_to_utc(reports_dttm_str)
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    else:
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None
                        LOGGER.info(
                            'START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get(
                                'api_method', 'GET'),
                            pagination=endpoint_config.get(
                                'pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'),
                            resource_name=resource_name)

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info(
                            'FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        LOGGER.info(
                            ' Records Synced for Date Window: {}'.format(
                                total_records))

                        # Set next date window
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                    # End date window loop

                    LOGGER.info(
                        'FINISHED Syncing Stream: {}, Site: {}, Type: {}'.
                        format(stream_name, site, sub_type))
                    LOGGER.info(
                        ' Records Synced for Type: {}'.format(sub_type_total))
                # End sub-type loop
            # End else: Not sitemaps and sites = sc-domain

            # NOTE(review): site_total is only assigned in the else branch;
            # if the first site hits the sitemaps/sc-domain skip, this log
            # raises NameError (or reports the previous site's total) —
            # confirm against upstream.
            LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(
                stream_name, site))
            LOGGER.info(' Records Synced for Site: {}'.format(site_total))
        # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info(' Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
def _is_selected(catalog_entry):
    """Return whether this catalog entry is selected; unset metadata defaults to False."""
    inclusion = catalog_entry.get('inclusion')
    selected = catalog_entry.get('selected')
    return singer.should_sync_field(inclusion, selected, False)
def should_sync_column(md_map, field_name):
    """Decide whether *field_name* should be synced; missing metadata means selected."""
    crumb = ("properties", field_name)
    field_meta = md_map.get(crumb, {})
    inclusion = field_meta.get("inclusion")
    selected = field_meta.get("selected")
    return singer.should_sync_field(inclusion, selected, True)
def _is_selected(catalog_entry):
    """Stream-level selection check using the empty breadcrumb; defaults to not selected."""
    mdata = metadata.to_map(catalog_entry['metadata'])
    inclusion = metadata.get(mdata, (), 'inclusion')
    selected = metadata.get(mdata, (), 'selected')
    return singer.should_sync_field(inclusion, selected, default=False)