Ejemplo n.º 1
0
def property_is_selected(stream, property_name):
    """Return True when *property_name* of *stream* is selected for sync.

    Missing selection metadata defaults to selected (True).
    """
    breadcrumb = ("properties", property_name)
    md_map = metadata.to_map(stream.metadata)
    inclusion = metadata.get(md_map, breadcrumb, "inclusion")
    selected = metadata.get(md_map, breadcrumb, "selected")
    return singer.should_sync_field(inclusion, selected, True)
Ejemplo n.º 2
0
    def _get_selected_properties(self, catalog_entry):
        """Return the names of schema properties selected for syncing.

        Falls back to ``self.select_fields_by_default`` when a property
        has no explicit selection metadata.
        """
        mdata = metadata.to_map(catalog_entry['metadata'])
        schema_props = catalog_entry['schema'].get('properties', {})

        chosen = []
        for name in schema_props:
            inclusion = metadata.get(mdata, ('properties', name), 'inclusion')
            is_selected = metadata.get(mdata, ('properties', name), 'selected')
            if singer.should_sync_field(inclusion, is_selected,
                                        self.select_fields_by_default):
                chosen.append(name)
        return chosen
Ejemplo n.º 3
0
    def _get_selected_properties(self, catalog_entry):
        """List the schema property names of *catalog_entry* that are selected."""
        mdata = metadata.to_map(catalog_entry["metadata"])

        def _wanted(name):
            # Delegate the decision to singer; select_fields_by_default is
            # the fallback when no explicit selection metadata exists.
            return singer.should_sync_field(
                metadata.get(mdata, ("properties", name), "inclusion"),
                metadata.get(mdata, ("properties", name), "selected"),
                self.select_fields_by_default,
            )

        schema_props = catalog_entry["schema"].get("properties", {})
        return [name for name in schema_props if _wanted(name)]
Ejemplo n.º 4
0
def selected_fields(catalog_for_stream):
    """Return the list of field names in *catalog_for_stream* that are selected.

    Args:
        catalog_for_stream: a singer catalog entry exposing ``.metadata``
            and ``.schema.properties``.

    Returns:
        list of field names for which ``should_sync_field`` is truthy.
    """
    mdata = metadata.to_map(catalog_for_stream.metadata)

    selected_fields_list = []
    for field in catalog_for_stream.schema.properties.keys():
        # Default to {} so a field with no metadata breadcrumb doesn't
        # crash with AttributeError on None.get(...).
        field_metadata = mdata.get(('properties', field), {})
        if should_sync_field(field_metadata.get('inclusion'),
                             field_metadata.get('selected')):
            selected_fields_list.append(field)

    return selected_fields_list
Ejemplo n.º 5
0
def property_is_selected(stream, property_name):
    """Detect if field is selected to sync"""
    crumb = ('properties', property_name)
    md_map = metadata.to_map(stream.metadata)
    inclusion = metadata.get(md_map, crumb, 'inclusion')
    chosen = metadata.get(md_map, crumb, 'selected')
    # Unspecified selection metadata defaults to selected (True).
    return singer.should_sync_field(inclusion, chosen, True)
Ejemplo n.º 6
0
def should_sync_column(md_map, field_name):
    """Decide whether *field_name* should be synced (defaults to selected)."""
    props = md_map.get(('properties', field_name), {})
    inclusion = props.get('inclusion')
    chosen = props.get('selected')
    return singer.should_sync_field(inclusion, chosen, True)
Ejemplo n.º 7
0
def sync(client, config, catalog, state):
    """Sync every selected stream for each configured site.

    For each stream selected in the catalog, loops over the sites listed in
    config ``site_urls``, each endpoint sub-type, and bookmark-driven date
    windows, delegating record extraction to ``sync_endpoint()`` and
    logging record totals at window, sub-type, site, and stream level.

    Args:
        client: API client passed through to ``sync_endpoint``.
        config: tap configuration dict; reads ``start_date`` and ``site_urls``.
        catalog: singer catalog supplying stream selection and metadata.
        state: singer state dict, updated in place (bookmarks and
            ``currently_syncing``).
    """
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    # Nothing selected: nothing to do.
    # NOTE(review): the second clause is redundant — an empty list is already falsy.
    if not selected_streams or selected_streams == []:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    # There is some delay/lag in Google Search Console results reconcilliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        # First replication key (if any) becomes the bookmark field.
        bookmark_field = next(
            iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        # NOTE(review): duplicates the body_params lookup two lines above;
        # `body` is overwritten inside the date-window loop anyway.
        body = endpoint_config.get('body', {})
        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            # site_urls is a comma-separated string; strip spaces before splitting.
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            #   "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site[0:9] == 'sc-domain':
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info(
                    '  Sitemaps API does not support domain property urls at this time.'
                )

            else:  # Not sitemaps and sites = sc-domain
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(
                    stream_name, site))
                site_total = 0
                # URL-encode the site so it can be embedded in the path.
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)
                resource_name = endpoint_config.get('resource_name')

                # Set dimension_list for performance_reports
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = [
                        'date', 'country', 'device', 'page', 'query'
                    ]
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    # NOTE(review): mutates the shared body_params dict; the
                    # next line then re-reads the (possibly updated) value.
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(
                    stream_name, dimensions_list))

                # loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window
                    if stream_name.startswith('performance_report'):
                        # Resume from the stored bookmark for this
                        # stream/site/sub_type; fall back to start_date.
                        reports_dttm_str = get_bookmark(
                            state, stream_name, site, sub_type, start_date)

                        reports_dttm = strptime_to_utc(reports_dttm_str)
                        # Cap the window start so recent, not-yet-reconciled
                        # GSC data (ATTRIBUTION_DAYS lag) is re-fetched.
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm

                    else:
                        # Non-report streams: one window covering everything.
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        # Dates are passed as YYYY-MM-DD (first 10 chars).
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None

                        LOGGER.info(
                            'START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get(
                                'api_method', 'GET'),
                            pagination=endpoint_config.get(
                                'pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'),
                            resource_name=resource_name)

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info(
                            'FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        LOGGER.info(
                            '  Records Synced for Date Window: {}'.format(
                                total_records))

                        # Set next date window
                        # Windows advance by DATE_WINDOW_SIZE days, clipped to now.
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                        # End date window loop

                    LOGGER.info(
                        'FINISHED Syncing Stream: {}, Site: {}, Type: {}'.
                        format(stream_name, site, sub_type))
                    LOGGER.info(
                        '  Records Synced for Type: {}'.format(sub_type_total))
                    # End sub-type loop
                # End else: Not sitemaps and sites = sc-domain

                LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(
                    stream_name, site))
                LOGGER.info('  Records Synced for Site: {}'.format(site_total))
                # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
Ejemplo n.º 8
0
def _is_selected(catalog_entry):
    """Return True when the entry is marked selected (default: not selected)."""
    inclusion = catalog_entry.get('inclusion')
    chosen = catalog_entry.get('selected')
    return singer.should_sync_field(inclusion, chosen, False)
Ejemplo n.º 9
0
def should_sync_column(md_map, field_name):
    """Check whether *field_name* should be synced; unspecified defaults to True."""
    crumb = ("properties", field_name)
    props = md_map.get(crumb, {})
    inclusion = props.get("inclusion")
    chosen = props.get("selected")
    return singer.should_sync_field(inclusion, chosen, True)
Ejemplo n.º 10
0
def _is_selected(catalog_entry):
    """Return whether the stream-level (empty breadcrumb) metadata is selected."""
    mdata = metadata.to_map(catalog_entry['metadata'])
    # () is the stream-level breadcrumb; unselected by default.
    inclusion = metadata.get(mdata, (), 'inclusion')
    chosen = metadata.get(mdata, (), 'selected')
    return singer.should_sync_field(inclusion, chosen, default=False)