Example 1
def sync(client, config, catalog, state):
    start_date = config.get('start_date')

    # Get selected_streams from catalog, based on state last_stream
    #   last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))
    selected_streams = []
    for stream in catalog.get_selected_streams(state):
        selected_streams.append(stream.stream)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # Get current datetime (now_dt_str) for query parameters
    now_dttm = utils.now()
    now_dt_str = strftime(now_dttm)[0:10]
    # Reference: https://support.google.com/webmasters/answer/96568?hl=en
    # There is some delay/lag in Google Search Console results reconciliation
    attribution_start_dttm = now_dttm - timedelta(days=ATTRIBUTION_DAYS)

    # Loop through selected_streams
    for stream_name in selected_streams:
        LOGGER.info('STARTED Syncing: {}'.format(stream_name))
        update_currently_syncing(state, stream_name)
        write_schema(catalog, stream_name)
        endpoint_config = STREAMS[stream_name]
        bookmark_field = next(
            iter(endpoint_config.get('replication_keys', [])), None)
        body_params = endpoint_config.get('body', {})
        endpoint_total = 0
        # Initialize body
        body = endpoint_config.get('body', {})
        # Loop through sites from config site_urls
        site_list = []
        if 'site_urls' in config:
            site_list = config['site_urls'].replace(" ", "").split(",")
        for site in site_list:
            # Skip/ignore sitemaps for domain property sites
            # Reference issue: https://github.com/googleapis/google-api-php-client/issues/1607
            #   "...sitemaps API does not support domain property urls at this time."
            if stream_name == 'sitemaps' and site.startswith('sc-domain'):
                LOGGER.info('Skipping Site: {}'.format(site))
                LOGGER.info(
                    '  Sitemaps API does not support domain property urls at this time.'
                )

            else:  # All other streams, or sitemaps for URL-prefix (non-domain) properties
                LOGGER.info('STARTED Syncing: {}, Site: {}'.format(
                    stream_name, site))
                site_total = 0
                site_encoded = quote(site, safe='')
                path = endpoint_config.get('path').format(site_encoded)

                # Set dimensions_list for performance_report_custom
                if stream_name == 'performance_report_custom':
                    dimensions_list = []
                    # Create dimensions_list from catalog breadcrumb
                    stream = catalog.get_stream(stream_name)
                    mdata = metadata.to_map(stream.metadata)
                    dimensions_all = [
                        'date', 'country', 'device', 'page', 'query'
                    ]
                    for dim in dimensions_all:
                        if singer.should_sync_field(
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'inclusion'),
                                singer.metadata.get(mdata, ('properties', dim),
                                                    'selected')):
                            # metadata is selected for the dimension
                            dimensions_list.append(dim)
                    body_params['dimensions'] = dimensions_list
                dimensions_list = body_params.get('dimensions')
                LOGGER.info('stream: {}, dimensions_list: {}'.format(
                    stream_name, dimensions_list))

                # loop through each sub type
                sub_types = endpoint_config.get('sub_types', ['self'])
                for sub_type in sub_types:
                    sub_type_total = 0

                    # Initialize date window
                    if stream_name.startswith('performance_report'):
                        reports_dttm_str = get_bookmark(
                            state, stream_name, site, sub_type, start_date)

                        reports_dttm = strptime_to_utc(reports_dttm_str)
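                        # Start from the earlier of the bookmark and the attribution
                        #   window start so recently reported (still-settling) days are re-synced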
                        if reports_dttm < attribution_start_dttm:
                            start_dttm = reports_dttm
                        else:
                            start_dttm = attribution_start_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm

                    else:
                        start_dttm = strptime_to_utc(start_date)
                        end_dttm = now_dttm

                    # Date window loop
                    while start_dttm < now_dttm:
                        start_str = strftime(start_dttm)[0:10]
                        end_str = strftime(end_dttm)[0:10]
                        if stream_name.startswith('performance_report'):
                            body = {
                                'searchType': sub_type,
                                'startDate': start_str,
                                'endDate': end_str,
                                **body_params
                            }
                        else:
                            body = None

                        LOGGER.info(
                            'START Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        total_records = sync_endpoint(
                            client=client,
                            catalog=catalog,
                            state=state,
                            start_date=start_date,
                            stream_name=stream_name,
                            site=site,
                            sub_type=sub_type,
                            dimensions_list=dimensions_list,
                            path=path,
                            endpoint_config=endpoint_config,
                            api_method=endpoint_config.get(
                                'api_method', 'GET'),
                            pagination=endpoint_config.get(
                                'pagination', 'none'),
                            static_params=endpoint_config.get('params', {}),
                            bookmark_field=bookmark_field,
                            data_key=endpoint_config.get('data_key', None),
                            body_params=body,
                            id_fields=endpoint_config.get('key_properties'))

                        # Increment totals
                        endpoint_total = endpoint_total + total_records
                        site_total = site_total + total_records
                        sub_type_total = sub_type_total + total_records

                        LOGGER.info(
                            'FINISHED Syncing Stream: {}, Site: {}, Type: {}, {} to {}'
                            .format(stream_name, site, sub_type, start_str,
                                    end_str))
                        LOGGER.info(
                            '  Records Synced for Date Window: {}'.format(
                                total_records))

                        # Set next date window
                        start_dttm = end_dttm
                        end_dttm = start_dttm + timedelta(
                            days=DATE_WINDOW_SIZE)
                        if end_dttm > now_dttm:
                            end_dttm = now_dttm
                        # End date window loop

                    LOGGER.info(
                        'FINISHED Syncing Stream: {}, Site: {}, Type: {}'.
                        format(stream_name, site, sub_type))
                    LOGGER.info(
                        '  Records Synced for Type: {}'.format(sub_type_total))
                    # End sub-type loop
                # End else: non-skipped stream/site combination

                LOGGER.info('FINISHED Syncing Stream: {}, Site: {}'.format(
                    stream_name, site))
                LOGGER.info('  Records Synced for Site: {}'.format(site_total))
                # End site loop

        LOGGER.info('FINISHED Syncing Stream: {}'.format(stream_name))
        LOGGER.info('  Records Synced for Stream: {}'.format(endpoint_total))
        update_currently_syncing(state, None)
Example 2
 def is_bookmark_old(self, value):
     bookmark = self.get_bookmark()
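     # Despite the name, this returns True when the value is at or after the
     #   current bookmark, i.e. when the record should still be synced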
     return utils.strptime_to_utc(value) >= bookmark
Example 3
def get_current_sync_start(state, tap_stream_id):
    current_sync_start_value = singer.get_bookmark(state, tap_stream_id,
                                                   "current_sync_start")
    if current_sync_start_value is None:
        return current_sync_start_value
    return utils.strptime_to_utc(current_sync_start_value)
Example 4
def string_to_datetime(value):
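    # Normalize any parseable datetime string to the Singer UTC string format;
    #   log a warning and return None when the value cannot be parsed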
    try:
        return strftime(strptime_to_utc(value))
    except Exception as ex:
        LOGGER.warning("%s, (%s)", ex, value)
        return None
Example 5
def sync_event_updates(stream_name):
    '''
    Get updates via events endpoint

    look at 'events update' bookmark and pull events after that
    '''
    LOGGER.info("Started syncing event based updates")

    bookmark_value = singer.get_bookmark(Context.state,
                                         stream_name + '_events',
                                         'updates_created') or \
                     int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    max_created = bookmark_value
    date_window_start = max_created
    date_window_end = max_created + 604800  # Number of seconds in a week

    stop_paging = False

    # Create a map relating event object ids to timestamps
    updated_object_timestamps = {}

    while not stop_paging:
        extraction_time = singer.utils.now()

        response = STREAM_SDK_OBJECTS['events']['sdk_object'].list(
            **{
                "limit": 100,
                "type": STREAM_TO_TYPE_FILTER[stream_name]['type'],
                "stripe_account": Context.config.get('account_id'),
                # None passed to starting_after appears to retrieve
                # all of them so this should always be safe.
                "created[gte]": date_window_start,
                "created[lt]": date_window_end,
            })

        # If there are no results and the date window has reached the current time, stop paging
        if not len(response) and date_window_end > extraction_time.timestamp():  # pylint: disable=len-as-condition
            stop_paging = True

        for events_obj in response.auto_paging_iter():
            event_resource_obj = events_obj.data.object
            sub_stream_name = SUB_STREAMS.get(stream_name)

            # Check whether we should sync the event based on its created time
            if not should_sync_event(
                    events_obj, STREAM_TO_TYPE_FILTER[stream_name]['object'],
                    updated_object_timestamps):
                continue

            # Syncing an event as it's the first time we've seen it or it's the most recent version
            with Transformer(singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING
                             ) as transformer:
                event_resource_metadata = metadata.to_map(
                    Context.get_catalog_entry(stream_name)['metadata'])

                # Filter out line items with null ids
                if isinstance(
                        events_obj.get('data').get('object'), stripe.Invoice):
                    invoice_obj = events_obj.get('data', {}).get('object', {})
                    line_items = invoice_obj.get('lines', {}).get('data')

                    if line_items:
                        filtered_line_items = [
                            line_item for line_item in line_items
                            if line_item.get('id')
                        ]

                        invoice_obj['lines']['data'] = filtered_line_items

                rec = recursive_to_dict(event_resource_obj)
                rec = unwrap_data_objects(rec)
                rec = reduce_foreign_keys(rec, stream_name)
                rec["updated"] = events_obj.created
                rec = transformer.transform(
                    rec,
                    Context.get_catalog_entry(stream_name)['schema'],
                    event_resource_metadata)

                if events_obj.created >= bookmark_value:
                    if rec.get('id') is not None:
                        singer.write_record(stream_name,
                                            rec,
                                            time_extracted=extraction_time)
                        Context.updated_counts[stream_name] += 1

                        # Delete events should be synced but not their subobjects
                        if events_obj.get('type', '').endswith('.deleted'):
                            continue

                        if sub_stream_name and Context.is_selected(
                                sub_stream_name):
                            if event_resource_obj:
                                sync_sub_stream(sub_stream_name,
                                                event_resource_obj,
                                                updates=True)
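            # Track the newest event timestamp seen so the bookmark only ever moves forward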
            if events_obj.created > max_created:
                max_created = events_obj.created

        date_window_start = date_window_end
        date_window_end = date_window_end + 604800
        singer.write_bookmark(Context.state, stream_name + '_events',
                              'updates_created', max_created)
        singer.write_state(Context.state)

    singer.write_state(Context.state)
Example 6
def sync(client, config, catalog, state):
    start_date = config.get('start_date')
    # LOGGER.info('start_date = {}'.format(start_date))

    # Get datetimes for endpoint parameters
    communications_dttm_str = get_bookmark(state, 'communications', 'self',
                                           start_date)
    communications_dt_str = transform_datetime(communications_dttm_str)[:10]
    # LOGGER.info('communications bookmark_date = {}'.format(communications_dt_str))

    deposit_transactions_dttm_str = get_bookmark(state, 'deposit_transactions',
                                                 'self', start_date)
    deposit_transactions_dt_str = transform_datetime(
        deposit_transactions_dttm_str)[:10]
    # LOGGER.info('deposit_transactions bookmark_date = {}'.format(deposit_transactions_dt_str))

    loan_transactions_dttm_str = get_bookmark(state, 'loan_transactions',
                                              'self', start_date)
    loan_transactions_dt_str = transform_datetime(
        loan_transactions_dttm_str)[:10]
    loan_transactions_dttm = strptime_to_utc(loan_transactions_dt_str)

    clients_dttm_str = get_bookmark(state, 'clients', 'self', start_date)
    clients_dt_str = transform_datetime(clients_dttm_str)[:10]

    groups_dttm_str = get_bookmark(state, 'groups', 'self', start_date)
    groups_dt_str = transform_datetime(groups_dttm_str)[:10]

    lookback_days = int(config.get('lookback_window', LOOKBACK_DEFAULT))
    lookback_date = utils.now() - timedelta(lookback_days)
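    # Re-sync at least lookback_days of loan transactions by moving the bookmark
    #   back to the lookback date when the bookmark is more recent than that date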
    if loan_transactions_dttm > lookback_date:
        loan_transactions_dt_str = transform_datetime(
            strftime(lookback_date))[:10]
    # LOGGER.info('loan_transactions bookmark_date = {}'.format(loan_transactions_dt_str))

    # endpoints: API URL endpoints to be called
    # properties:
    #   <root node>: Plural stream name for the endpoint
    #   path: API endpoint relative path, when added to the base URL, creates the full path
    #   api_version: v1 or v2 (default v2).
    #   api_method: GET or POST (default GET).
    #   params: Query, sort, and other endpoint specific parameters
    #   data_key: JSON element containing the records for the endpoint
    #   bookmark_query_field: Typically a date-time field used for filtering the query
    #   bookmark_field: Replication key field, typically a date-time, used for filtering the results
    #        and setting the state
    #   bookmark_type: Data type for bookmark, integer or datetime
    #   id_fields: Primary key (and other IDs) from the Parent stored when store_ids is true.
    #   children: A collection of child endpoints (where the endpoint path includes the parent id)
    #   parent: On each of the children, the singular stream name for parent element
    #   Details Level: https://api.mambu.com/?http#detail-level, FULL includes custom fields

    endpoints = {
        'branches': {
            'path': 'branches',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'communications': {
            'path': 'communications/messages:search',
            'api_version': 'v2',
            'api_method': 'POST',
            'params': {
                'detailsLevel': 'FULL'
            },
            'body': [{
                'field': 'state',
                'operator': 'EQUALS',
                'value': 'SENT'
            }, {
                'field': 'creationDate',
                'operator': 'AFTER',
                'value': communications_dt_str
            }],
            'bookmark_field': 'creation_date',
            'bookmark_type': 'datetime',
            'id_fields': ['encoded_key']
        },
        'centres': {
            'path': 'centres',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'clients': {
            'path': 'clients:search',
            'api_version': 'v2',
            'api_method': 'POST',
            'params': {
                'detailsLevel': 'FULL'
            },
            'body': {
                "sortingCriteria": {
                    "field": "lastModifiedDate",
                    "order": "ASC"
                },
                "filterCriteria": [{
                    "field": "lastModifiedDate",
                    "operator": "AFTER",
                    "value": clients_dt_str
                }]
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'credit_arrangements': {
            'path': 'creditarrangements',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'creationDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'custom_field_sets': {
            'path': 'customfieldsets',
            'api_version': 'v1',
            'api_method': 'GET',
            'params': {},
            'id_fields': ['id']
        },
        'deposit_accounts': {
            'path': 'deposits',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id'],
            'store_ids': True,
            'children': {
                'cards': {
                    'path': 'deposits/{}/cards',
                    'api_version': 'v2',
                    'api_method': 'GET',
                    'params': {
                        'detailsLevel': 'FULL'
                    },
                    'id_fields': ['deposit_id', 'reference_token'],
                    'parent': 'deposit'
                }
            }
        },
        'deposit_products': {
            'path': 'savingsproducts',
            'api_version': 'v1',
            'api_method': 'GET',
            'params': {
                "fullDetails": True
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'deposit_transactions': {
            'path': 'deposits/transactions:search',
            'api_version': 'v2',
            'api_method': 'POST',
            'params': {
                'detailsLevel': 'FULL'
            },
            'body': {
                "sortingCriteria": {
                    "field": "creationDate",
                    "order": "ASC"
                },
                "filterCriteria": [{
                    "field": "creationDate",
                    "operator": "AFTER",
                    "value": deposit_transactions_dt_str
                }]
            },
            'bookmark_field': 'creation_date',
            'bookmark_type': 'datetime',
            'id_fields': ['encoded_key']
        },
        'groups': {
            'path': 'groups:search',
            'api_version': 'v2',
            'api_method': 'POST',
            'params': {
                'detailsLevel': 'FULL'
            },
            'body': {
                "sortingCriteria": {
                    "field": "lastModifiedDate",
                    "order": "ASC"
                },
                "filterCriteria": [{
                    "field": "lastModifiedDate",
                    "operator": "AFTER",
                    "value": groups_dt_str
                }]
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'loan_accounts': {
            'path': 'loans',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id'],
            'children': {
                'loan_repayments': {
                    'path': 'loans/{}/repayments',
                    'api_version': 'v1',
                    'api_method': 'GET',
                    'params': {
                        'detailsLevel': 'FULL',
                        'paginationDetails': 'ON'
                    },
                    'id_fields': ['encoded_key'],
                    'parent': 'loan_accounts'
                }
            }
        },
        'loan_products': {
            'path': 'loanproducts',
            'api_version': 'v1',
            'api_method': 'GET',
            'params': {
                "fullDetails": True
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'loan_transactions': {
            'path': 'loans/transactions:search',
            'api_version': 'v2',
            'api_method': 'POST',
            'params': {
                'detailsLevel': 'FULL'
            },
            'body': {
                "sortingCriteria": {
                    "field": "creationDate",
                    "order": "ASC"
                },
                "filterCriteria": [{
                    "field": "creationDate",
                    "operator": "AFTER",
                    "value": loan_transactions_dt_str
                }]
            },
            'bookmark_field': 'creation_date',
            'bookmark_type': 'datetime',
            'id_fields': ['encoded_key']
        },
        'tasks': {
            'path': 'tasks',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'users': {
            'path': 'users',
            'api_version': 'v2',
            'api_method': 'GET',
            'params': {
                'sortBy': 'lastModifiedDate:ASC',
                'detailsLevel': 'FULL',
                'paginationDetails': 'ON'
            },
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'id_fields': ['id']
        },
        'gl_accounts': {
            'path': 'glaccounts',
            'api_version': 'v1',
            'api_method': 'GET',
            'params': {
                'type': '{sub_type}'
            },
            'id_fields': ['gl_code'],
            'bookmark_field': 'last_modified_date',
            'bookmark_type': 'datetime',
            'sub_types': ['ASSET', 'LIABILITY', 'EQUITY', 'INCOME', 'EXPENSE']
        },
        'gl_journal_entries': {
            'path': 'gljournalentries/search',
            'api_version': 'v1',
            'api_method': 'POST',
            'body': {
                "filterConstraints": [{
                    "filterSelection": "CREATION_DATE",
                    "filterElement": "BETWEEN",
                    "value": '{gl_journal_entries_from_dt_str}',
                    "secondValue": "{now_date_str}"
                }]
            },
            'id_fields': ['entry_id'],
            'bookmark_field': 'booking_date',
            'bookmark_type': 'datetime'
        },
        'activities': {
            'path': 'activities',
            'api_version': 'v1',
            'api_method': 'GET',
            'params': {
                'from': '{activities_from_dt_str}',
                'to': '{now_date_str}'
            },
            'id_fields': ['encoded_key'],
            'bookmark_field': 'timestamp',
            'bookmark_type': 'datetime'
        },
        'index_rate_sources': {
            'path': 'indexratesources',
            'api_version': 'v2',
            'api_method': 'GET',
            'id_fields': ['encoded_key'],
            'params': {}
        },
        'installments': {
            'path': 'installments',
            'api_version': 'v2',
            'api_method': 'GET',
            'id_fields': ['encoded_key'],
            'params': {
                'dueFrom': '{installments_from_dt_str}',
                'dueTo': '{now_date_str}'
            },
            'bookmark_field': 'last_paid_date',
            'bookmark_type': 'datetime'
        }
    }

    selected_streams = get_selected_streams(catalog)
    LOGGER.info('selected_streams: {}'.format(selected_streams))

    if not selected_streams:
        return

    # last_stream = Previous currently synced stream, if the load was interrupted
    last_stream = singer.get_currently_syncing(state)
    LOGGER.info('last/currently syncing stream: {}'.format(last_stream))

    # For each endpoint (above), determine if the stream should be streamed
    #   (based on the catalog and last_stream), then sync those streams.
    for stream_name, endpoint_config in endpoints.items():
        should_stream, last_stream = should_sync_stream(
            selected_streams, last_stream, stream_name)

        if should_stream:
            # loop through each sub type
            sub_types = endpoint_config.get('sub_types', ['self'])
            for sub_type in sub_types:
                LOGGER.info('START Syncing: {}, Type: {}'.format(
                    stream_name, sub_type))

                # Now date
                if stream_name == 'gl_journal_entries':
                    now_date_str = strftime(utils.now())[:10]
                    gl_journal_entries_from_dttm_str = get_bookmark(
                        state, 'gl_journal_entries', sub_type, start_date)
                    gl_journal_entries_from_dt_str = transform_datetime(
                        gl_journal_entries_from_dttm_str)[:10]
                    gl_journal_entries_from_param = endpoint_config.get(
                        'body', {}).get('filterConstraints',
                                        {})[0].get('value')
                    if gl_journal_entries_from_param:
                        endpoint_config['body']['filterConstraints'][0][
                            'value'] = gl_journal_entries_from_dt_str
                    gl_journal_entries_to_param = endpoint_config.get(
                        'body', {}).get('filterConstraints',
                                        {})[0].get('secondValue')
                    if gl_journal_entries_to_param:
                        endpoint_config['body']['filterConstraints'][0][
                            'secondValue'] = now_date_str

                if stream_name == 'activities':
                    now_date_str = strftime(utils.now())[:10]
                    activities_from_dttm_str = get_bookmark(
                        state, 'activities', sub_type, start_date)
                    activities_from_dt_str = transform_datetime(
                        activities_from_dttm_str)[:10]
                    activities_from_param = endpoint_config.get('params',
                                                                {}).get('from')
                    if activities_from_param:
                        endpoint_config['params'][
                            'from'] = activities_from_dt_str
                    activities_to_param = endpoint_config.get('params',
                                                              {}).get('to')
                    if activities_to_param:
                        endpoint_config['params']['to'] = now_date_str

                if stream_name == 'installments':
                    now_date_str = strftime(utils.now())[:10]
                    installments_from_dttm_str = get_bookmark(
                        state, 'installments', sub_type, start_date)
                    installments_from_dt_str = transform_datetime(
                        installments_from_dttm_str)[:10]
                    installments_from_param = endpoint_config.get(
                        'params', {}).get('dueFrom')
                    if installments_from_param:
                        endpoint_config['params'][
                            'dueFrom'] = installments_from_dt_str
                    installments_to_param = endpoint_config.get(
                        'params', {}).get('dueTo')
                    if installments_to_param:
                        endpoint_config['params']['dueTo'] = now_date_str

                update_currently_syncing(state, stream_name)
                path = endpoint_config.get('path')
                sub_type_param = endpoint_config.get('params', {}).get('type')
                if sub_type_param:
                    endpoint_config['params']['type'] = sub_type

                total_records = sync_endpoint(
                    client=client,
                    catalog=catalog,
                    state=state,
                    start_date=start_date,
                    stream_name=stream_name,
                    path=path,
                    endpoint_config=endpoint_config,
                    api_version=endpoint_config.get('api_version', 'v2'),
                    api_method=endpoint_config.get('api_method', 'GET'),
                    static_params=endpoint_config.get('params', {}),
                    sub_type=sub_type,
                    bookmark_query_field=endpoint_config.get(
                        'bookmark_query_field'),
                    bookmark_field=endpoint_config.get('bookmark_field'),
                    bookmark_type=endpoint_config.get('bookmark_type'),
                    data_key=endpoint_config.get('data_key', None),
                    body=endpoint_config.get('body', None),
                    id_fields=endpoint_config.get('id_fields'))

                update_currently_syncing(state, None)
                LOGGER.info('Synced: {}, total_records: {}'.format(
                    stream_name, total_records))
                LOGGER.info('FINISHED Syncing: {}'.format(stream_name))
Example 7
 def get_bookmark(self, state):
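     # Fall back to the configured start_date when no bookmark exists for this stream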
     bookmark = (get_bookmark(state, self.name, self.replication_key)
                 or self.start_date)
     return utils.strptime_to_utc(bookmark)
Example 8
    def sync_substream(self, state, parent, sub_stream, parent_response):
        bookmark_date = self.get_bookmark(state, sub_stream.name,
                                          self.config.get('start_date'),
                                          sub_stream.replication_key)
        # If last sync was interrupted, get last processed parent record
        last_processed = self.get_bookmark(state,
                                           sub_stream.name,
                                           None,
                                           key="last_processed")
        bookmark_dttm = strptime_to_utc(bookmark_date)
        new_bookmark = bookmark_dttm
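        # new_bookmark tracks the maximum replication-key value seen across sub-stream events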

        singer.write_schema(sub_stream.name,
                            sub_stream.stream.schema.to_dict(),
                            sub_stream.key_properties)

        # Slice response for >= last processed
        if last_processed:
            for i, e in enumerate(parent_response):
                if e.get(parent.key_properties[0]) == last_processed:
                    LOGGER.info("Resuming %s sync with %s", sub_stream.name,
                                e.get(parent.key_properties[0]))
                    parent_response = parent_response[i:len(parent_response)]
                    break

        next_log_progress_percentage = 0
        for index, record in enumerate(parent_response):
            try:
                with metrics.record_counter(
                        sub_stream.name) as counter, Transformer(
                            integer_datetime_fmt=
                            "unix-milliseconds-integer-datetime-parsing"
                        ) as transformer:
                    stream_events = sub_stream.sync(
                        state, new_bookmark,
                        record.get(parent.key_properties[0]))
                    for event in stream_events:
                        counter.increment()

                        schema_dict = sub_stream.stream.schema.to_dict()
                        stream_metadata = metadata.to_map(
                            sub_stream.stream.metadata)

                        transformed_event = sub_stream.transform(event)

                        try:
                            transformed_record = transformer.transform(
                                transformed_event, schema_dict,
                                stream_metadata)
                        except Exception as err:
                            LOGGER.error('Error: %s', err)
                            LOGGER.error(
                                ' for schema: %s',
                                json.dumps(schema_dict,
                                           sort_keys=True,
                                           indent=2))
                            raise err

                        event_time = strptime_to_utc(
                            transformed_record.get(sub_stream.replication_key))

                        new_bookmark = max(new_bookmark, event_time)
                        singer.write_record(sub_stream.stream.tap_stream_id,
                                            transformed_record)

            except HTTPError:
                LOGGER.warning(
                    "Unable to retrieve %s Event for Stream (ID: %s)",
                    sub_stream.name, record[parent.key_properties[0]])

            # Record this parent as last_processed so an interrupted sync can resume here
            self.update_bookmark(state=state,
                                 stream=sub_stream.name,
                                 bookmark_value=record.get(
                                     parent.key_properties[0]),
                                 bookmark_key="last_processed")
            self.update_bookmark(state=state,
                                 stream=sub_stream.name,
                                 bookmark_value=strftime(new_bookmark),
                                 bookmark_key=sub_stream.replication_key)

            progress_percentage = float(index) / len(parent_response) * 100
            if progress_percentage > next_log_progress_percentage:
                LOGGER.info(
                    "Finished syncing %s percentage of sub_stream for parent %s's sub_stream %s data",
                    progress_percentage, parent.name, sub_stream.name)
                next_log_progress_percentage += self.LOG_PROGRESS_PERCENTAGE_INTERVAL

        # After processing all parent ids we can remove the resumption state
        state.get('bookmarks', {}).get(sub_stream.name, {}).pop('last_processed', None)
        update_currently_syncing(state, None)
Example 9
def sync_endpoint(client,  # pylint: disable=too-many-branches
                  catalog,
                  state,
                  start_date,
                  stream_name,
                  path,
                  endpoint_config,
                  static_params,
                  bookmark_query_field=None,
                  bookmark_field=None,
                  bookmark_type=None,
                  data_key=None,
                  id_fields=None,
                  selected_streams=None,
                  replication_ind=None,
                  parent=None,
                  parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime
        LOGGER.info('{}, initial max_bookmark_value {}'.format(stream_name, max_bookmark_value))
        max_bookmark_dttm = strptime_to_utc(last_datetime)
        max_bookmark_int = int(time.mktime(max_bookmark_dttm.timetuple()))
        now_int = int(time.time())
        updated_since_sec = now_int - max_bookmark_int
        updated_since_days = math.ceil(updated_since_sec/(24 * 60 * 60))
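        # updated_since_days feeds endpoints whose bookmark_query_field filters by
        #   "days since updated" (see the datetime branch in the querystring setup below)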

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    # per_page batch size: the API default is 50, max is 60
    limit = endpoint_config.get('batch_size', 60)
    total_records = 0

    # Check scroll_type to determine if to use Scroll API
    #   scroll_types: always, never.
    #   Endpoints:
    #       always: customers
    #       never: all others
    # Scroll API: https://developers.intercom.io/reference?_ga=2.237132992.1579857338.1569387987-1032864292.1569297580#iterating-over-all-users
    scroll_type = endpoint_config.get('scroll_type', 'never')

    # Check whether the endpoint supports a cursor
    # https://developers.intercom.com/intercom-api-reference/reference#pagination-cursor
    cursor = endpoint_config.get('cursor', False)
    search = endpoint_config.get('search', False)

    # Scroll for always re-syncs
    if scroll_type == 'always':
        LOGGER.info('Stream: {}, Historical Sync, Using Scroll API'.format(stream_name))
        is_scrolling = True
        next_url = '{}/{}/scroll'.format(client.base_url, path)
        params = {}
    else:
        is_scrolling = False
        next_url = '{}/{}'.format(client.base_url, path)

        # INTERPOLATE PAGE:
        # Endpoints: conversations and leads
        # Pre-requisites: Endpoint allows SORT ASC by bookmark and PAGING, but does not provide query filtering params.
        # Interpolate Page: Find start page based on sorting results, bookmark datetime, and binary search algorithm.
        #    Algorithm tries to estimate start page based on bookmark, and then splits the difference if it
        #       exceeds the start page or falls short of the start page based on the page's 1st and last record bookmarks.
        interpolate_page = endpoint_config.get('interpolate_page', False)
        if interpolate_page:
            # Interpolate based on current page, total_pages, and updated_at to get first page
            min_page = 1
            max_page = 4  # initial value, reset to total_pages on 1st API call
            i = 1
            while (max_page - min_page) > 2:
                params = {
                    'page': page,
                    'per_page': limit,
                    **static_params  # adds in endpoint specific, sort, filter params
                }
                querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
                # API request data
                data = {}
                data = client.get(
                    path=path,
                    params=querystring,
                    endpoint=stream_name)
                page = int(data.get('pages', {}).get('page'))
                # per_page = int(data.get('pages', {}).get('per_page'))
                total_pages = int(data.get('pages', {}).get('total_pages'))
                if i == 1:
                    max_page = total_pages
                list_len = len(data.get(data_key, []))
                LOGGER.info('Interpolate start page: i = {}, page = {}, min_page = {}, max_page = {}'.format(i, page, min_page, max_page))
                first_record_updated_int = data.get(data_key, [])[0].get('updated_at')
                last_record_updated_int = data.get(data_key, [])[list_len - 1].get('updated_at')
                if i == 1:
                    # FIRST GUESS - based on TOTAL PAGES, last bookmark, and % of time difference: (bookmark - 1st Record) / (NOW - 1st Record)
                    # Get next_page based on proportional ratio of time integers
                    #  If bookmark datetime in at 90% of (NOW - 1st Record) and there are 100 pages TOTAL, then try page 90
                    #  NOTE: It is better for NEXT GUESSES to slightly under-shoot - so that splitting the difference is smaller.
                    #    If you start at 1, then over-shoot to 90, but the 1st Page is 89, the next_page guess will be 45.
                    #    This is why next_page is 95% of pct_time x total_pages
                    pct_time = ((max_bookmark_int - first_record_updated_int)/(now_int - first_record_updated_int))
                    LOGGER.info('Interpolate percent based on time diff: {}%'.format(math.floor(pct_time * 100)))
                    next_page = math.floor(0.95 * pct_time * total_pages)  # Adjust 1st GUESS to lower by 5% to under-shoot
                    LOGGER.info('  next_page = {}'.format(next_page))
                elif first_record_updated_int <= max_bookmark_int and last_record_updated_int >= max_bookmark_int:
                    # First page found, stop looping
                    min_page = page
                    LOGGER.info('First page found. page = {}'.format(page))
                    break
                elif last_record_updated_int < max_bookmark_int:
                    # Increase page by half
                    min_page = page
                    next_page = page + math.ceil((1 + max_page - min_page) / 2)
                    LOGGER.info('Increase page. next_page = {}'.format(next_page))
                elif first_record_updated_int > max_bookmark_int:
                    # Decrease the page by half
                    max_page = page
                    next_page = page - math.floor((1 + max_page - min_page) / 2)
                    LOGGER.info('Decrease page. next_page = {}'.format(next_page))
                else:
                    # Break out of loop
                    break
                page = next_page
                i = i + 1
            # Set params to interpolated page
            params = {
                'page': min_page,
                'per_page': limit,
                **static_params  # adds in endpoint specific, sort, filter params
            }
            # FINISH INTERPOLATION
        elif cursor:
            params = {
                'per_page': limit,
                **static_params
            }
        # NORMAL SYNC - Not SCROLLING, Not INTERPOLATION
        #   Standard INCREMENTAL or FULL TABLE
        else:
            params = {
                'page': page,
                'per_page': limit,
                **static_params  # adds in endpoint specific, sort, filter params
            }

    request_body = None
    # Initial search query contains only a starting_time
    if search:
        search_query = endpoint_config.get('search_query')
        request_body = build_query(search_query, max_bookmark_int)

    i = 1
    while next_url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        if i == 1 and not is_scrolling:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = updated_since_days
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer
            if params != {}:
                querystring = '&'.join(['%s=%s' % (key, value) for (key, value) in params.items()])
        else:
            querystring = None
        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name,
            next_url,
            '?{}'.format(querystring) if querystring else ''))

        # API request data
        data = {}
        data = client.perform(
            method=endpoint_config.get('method'),
            url=next_url,
            path=path,
            params=querystring,
            endpoint=stream_name,
            json=request_body)

        # LOGGER.info('data = {}'.format(data)) # TESTING, comment out

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data:
            break  # No data results

        # Transform data with transform_json from transform.py
        # The data_key identifies the array/list of records below the <root> element.
        # SINGLE RECORD data results appear as dictionary.
        # MULTIPLE RECORD data results appear as an array-list under the data_key.
        # The following code converts ALL results to an array-list and transforms data.
        transformed_data = []  # initialize the record list
        data_list = []
        data_dict = {}
        if isinstance(data, list) and data_key not in data:
            data_list = data
            data_dict[data_key] = data_list
            transformed_data = transform_json(data_dict, stream_name, data_key)
        elif isinstance(data, dict) and data_key not in data:
            data_list.append(data)
            data_dict[data_key] = data_list
            transformed_data = transform_json(data_dict, stream_name, data_key)
        else:
            transformed_data = transform_json(data, stream_name, data_key)
        # LOGGER.info('transformed_data = {}'.format(transformed_data))  # TESTING, comment out
        if not transformed_data:
            if parent_id is None:
                LOGGER.info('Stream: {}, No transformed data for data = {}'.format(
                    stream_name, data))
            break  # No data results
        # Verify key id_fields are present
        rec_count = 0
        for record in transformed_data:
            for key in id_fields:
                if not record.get(key):
                    LOGGER.info('Stream: {}, Missing key {} in record: {}'.format(
                        stream_name, key, record))
                    raise RuntimeError('Missing key {} in record for stream {}'.format(
                        key, stream_name))
            rec_count = rec_count + 1

        # Process records and get the max_bookmark_value and record_count for the set of records
        if replication_ind:
            max_bookmark_value, record_count = process_records(
                catalog=catalog,
                stream_name=stream_name,
                records=transformed_data,
                time_extracted=time_extracted,
                bookmark_field=bookmark_field,
                bookmark_type=bookmark_type,
                max_bookmark_value=max_bookmark_value,
                last_datetime=last_datetime,
                last_integer=last_integer,
                parent=parent,
                parent_id=parent_id)
            LOGGER.info('Stream {}, batch processed {} records'.format(
                stream_name, record_count))
        else:
            record_count = 0

        # Loop through parent batch records for each child object (if the child stream is selected)
        children = endpoint_config.get('children')
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                if child_stream_name in selected_streams:
                    child_replication_ind = child_endpoint_config.get('replication_ind', True)
                    if child_replication_ind:
                        write_schema(catalog, child_stream_name)
                        child_selected_fields = get_selected_fields(catalog, child_stream_name)
                        LOGGER.info('Stream: {}, selected_fields: {}'.format(
                            child_stream_name, child_selected_fields))
                        total_child_records = 0
                        # For each parent record
                        for record in transformed_data:
                            i = 0
                            # Set parent_id
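                            # Prefer the 'id' field when present; otherwise use the first id_field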
                            for id_field in id_fields:
                                if i == 0:
                                    parent_id_field = id_field
                                if id_field == 'id':
                                    parent_id_field = id_field
                                i = i + 1
                            parent_id = record.get(parent_id_field)

                            # sync_endpoint for child
                            LOGGER.info('Syncing: {}, parent_stream: {}, parent_id: {}'.format(
                                child_stream_name,
                                stream_name,
                                parent_id))
                            child_path = child_endpoint_config.get('path', child_stream_name).format(
                                str(parent_id))
                            child_bookmark_field = next(iter(child_endpoint_config.get(
                                'replication_keys', [])), None)
                            child_total_records = sync_endpoint(
                                client=client,
                                catalog=catalog,
                                state=state,
                                start_date=start_date,
                                stream_name=child_stream_name,
                                path=child_path,
                                endpoint_config=child_endpoint_config,
                                static_params=child_endpoint_config.get('params', {}),
                                bookmark_query_field=child_endpoint_config.get(
                                    'bookmark_query_field', None),
                                bookmark_field=child_bookmark_field,
                                bookmark_type=child_endpoint_config.get('bookmark_type', None),
                                data_key=child_endpoint_config.get('data_key', child_stream_name),
                                id_fields=child_endpoint_config.get('key_properties'),
                                selected_streams=selected_streams,
                                replication_ind=child_replication_ind,
                                parent=child_endpoint_config.get('parent'),
                                parent_id=parent_id)
                            LOGGER.info('Synced: {}, parent_id: {}, records: {}'.format(
                                child_stream_name,
                                parent_id,
                                child_total_records))
                            total_child_records = total_child_records + child_total_records
                        LOGGER.info('Parent Stream: {}, Child Stream: {}, FINISHED PARENT BATCH'.format(
                            stream_name, child_stream_name))
                        LOGGER.info('Synced: {}, total_records: {}'.format(
                            child_stream_name,
                            total_child_records))

        # set total_records and next_url for pagination
        total_records = total_records + record_count
        if is_scrolling:
            scroll_param = data.get('scroll_param')
            if not scroll_param:
                break
            next_url = '{}/{}/scroll?scroll_param={}'.format(client.base_url, path, scroll_param)
        elif cursor:
            pagination = data.get('pages', {}).get('next', {})
            starting_after = pagination.get('starting_after', None)
            next_url = '{}/{}?starting_after={}'.format(client.base_url, path, starting_after)
        elif search:
            pagination = data.get('pages', {}).get('next', {})
            starting_after = pagination.get('starting_after', None)
            # Subsequent search queries require starting_after
            if starting_after:
                request_body = build_query(search_query, max_bookmark_int, starting_after)
            else:
                next_url = None
        else:
            next_url = data.get('pages', {}).get('next', None)

        # Update the state with the max_bookmark_value for non-scrolling
        if bookmark_field and not is_scrolling:
            write_bookmark(state, stream_name, max_bookmark_value)

        # to_rec: to record; ending record for the batch page
        to_rec = offset + rec_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name,
            page,
            offset,
            to_rec))
        # Pagination: increment the offset by the limit (batch-size) and page
        offset = offset + rec_count
        page = page + 1
        i = i + 1

    # Return total_records across all pages
    LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format(
        stream_name,
        page - 1,
        total_records))

    # Update the state with the max_bookmark_value once all pages are done when scrolling
    if bookmark_field and is_scrolling:
        write_bookmark(state, stream_name, max_bookmark_value)

    return total_records
Example 10
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        path,
        endpoint_config,
        static_params,
        bookmark_query_field=None,
        bookmark_field=None,
        bookmark_type=None,
        data_key=None,
        id_fields=None,
        selected_streams=None,
        parent=None,
        parent_id=None):

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    last_integer = None
    max_bookmark_value = None
    if bookmark_type == 'integer':
        last_integer = get_bookmark(state, stream_name, 0)
        max_bookmark_value = last_integer
    else:
        last_datetime = get_bookmark(state, stream_name, start_date)
        max_bookmark_value = last_datetime
        max_bookmark_dttm = strptime_to_utc(last_datetime)
        max_bookmark_int = int(time.mktime(max_bookmark_dttm.timetuple()))
        now_int = int(time.time())
        updated_since_sec = now_int - max_bookmark_int
        updated_since_days = math.ceil(updated_since_sec / (24 * 60 * 60))

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    limit = 100  # Default per_page limit is 100
    total_endpoint_records = 0
    next_url = '{}/{}'.format(client.base_url, path)
    params = {
        'page': page,
        'per': limit,
        **static_params  # adds in endpoint specific, sort, filter params
    }

    total_processed_records = 0

    while next_url is not None:
        # Need URL querystring for 1st page; subsequent pages provided by next_url
        # querystring: Squash query params into string
        querystring = None
        if page == 1:
            if bookmark_query_field:
                if bookmark_type == 'datetime':
                    params[bookmark_query_field] = start_date
                elif bookmark_type == 'integer':
                    params[bookmark_query_field] = last_integer
            if params != {}:
                querystring = '&'.join([
                    '%s=%s' % (key, value) for (key, value) in params.items()
                ])
        else:
            querystring = None
        LOGGER.info('URL for Stream {}: {}{}'.format(
            stream_name, next_url,
            '?{}'.format(querystring) if querystring else ''))

        # API request data
        # total_endpoint_records: API response for all pages
        data = {}
        data, total_endpoint_records, next_url = client.get(
            url=next_url, path=path, params=querystring, endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            return total_endpoint_records  # No data results

        # Transform data with transform_json from transform.py
        # The data_key identifies the array/list of records below the <root> element
        transformed_data = []  # initialize the record list
        data_list = []
        # data_dict = {}
        if isinstance(data, list) and data_key not in data:
            data_list = data
            transformed_data = transform_json(data, stream_name, data_key)

        if not transformed_data or transformed_data is None:
            LOGGER.info('No transformed data for data = {}'.format(data))
            return total_endpoint_records  # No data results

        total_submitted_records = len(transformed_data)

        # Process records and get the max_bookmark_value and record_count for the set of records
        max_bookmark_value, record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=transformed_data,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            bookmark_type=bookmark_type,
            max_bookmark_value=max_bookmark_value,
            last_datetime=last_datetime,
            last_integer=last_integer,
            parent=parent,
            parent_id=parent_id)

        total_processed_records = total_processed_records + record_count
        LOGGER.info(
            'Stream {}, batch processed {} records, total processed records {}'
            .format(stream_name, record_count, total_processed_records))

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, max_bookmark_value)

        # to_rec: to record; ending record for the batch page
        to_rec = offset + record_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))
        # Pagination: increment the offset by the records processed and the page
        offset = offset + record_count
        page = page + 1

    # Return total_endpoint_records across all pages
    LOGGER.info('Synced Stream: {}, pages: {}, total records: {}'.format(
        stream_name, page - 1, total_endpoint_records))
    return total_endpoint_records
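Near the top of this example the datetime bookmark is turned into a "days since last sync" figure. A short sketch of that arithmetic follows; the ISO timestamp is fabricated for illustration, and datetime.timestamp() stands in for the time.mktime call used above.

import math
import time
from datetime import datetime, timezone

# Fabricated bookmark value, illustration only
last_datetime = '2019-10-13T22:40:01Z'
max_bookmark_dttm = datetime.strptime(last_datetime, '%Y-%m-%dT%H:%M:%SZ').replace(tzinfo=timezone.utc)
max_bookmark_int = int(max_bookmark_dttm.timestamp())
now_int = int(time.time())
updated_since_sec = now_int - max_bookmark_int
updated_since_days = math.ceil(updated_since_sec / (24 * 60 * 60))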
Example no. 11
def sync_companies(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))
    bumble_bee = Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING)
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(STATE, "companies", bookmark_key))
    LOGGER.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("companies", schema, ["companyId"], [bookmark_key],
                        catalog.get('stream_alias'))

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current sync's start
    # in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "companies") or utils.now()
    STATE = write_current_sync_start(STATE, "companies", current_sync_start)
    singer.write_state(STATE)

    url = get_url("companies_all")
    max_bk_value = start
    if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
        contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
        singer.write_schema("contacts_by_company", contacts_by_company_schema,
                            ["company-id", "contact-id"])

    with bumble_bee:
        for row in gen_request(STATE, 'companies', url, default_company_params,
                               'companies', 'has-more', ['offset'],
                               ['offset']):
            row_properties = row['properties']
            modified_time = None
            if bookmark_key in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties[bookmark_key][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)
            elif 'createdate' in row_properties:
                # Hubspot returns timestamps in millis
                timestamp_millis = row_properties['createdate'][
                    'timestamp'] / 1000.0
                modified_time = datetime.datetime.fromtimestamp(
                    timestamp_millis, datetime.timezone.utc)

            if modified_time and modified_time >= max_bk_value:
                max_bk_value = modified_time

            if not modified_time or modified_time >= start:
                record = request(
                    get_url("companies_detail",
                            company_id=row['companyId'])).json()
                record = bumble_bee.transform(
                    lift_properties_and_versions(record), schema, mdata)
                singer.write_record("companies",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=utils.now())
                if CONTACTS_BY_COMPANY in ctx.selected_stream_ids:
                    STATE = _sync_contacts_by_company(STATE, ctx,
                                                      record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    STATE = singer.write_bookmark(STATE, 'companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'companies', None)
    singer.write_state(STATE)
    return STATE
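The final bookmark write above deliberately clamps to the sync's own start time, so records updated while the sync was running are picked up again next run. A sketch of that clamp with fabricated datetimes:

from datetime import datetime, timedelta, timezone

# Fabricated values for illustration
current_sync_start = datetime(2020, 1, 1, 12, 0, tzinfo=timezone.utc)
max_bk_value = current_sync_start + timedelta(hours=3)  # record seen after the sync started
# Never advance the bookmark past the moment this sync began
new_bookmark = min(max_bk_value, current_sync_start)
assert new_bookmark == current_sync_start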
Example no. 12
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        search_path,
        endpoint_config,
        bookmark_field=None,
        selected_streams=None):

    # Endpoint parameters
    bookmark_query_field = endpoint_config.get('bookmark_query_field', None)
    data_key = endpoint_config.get('data_key', stream_name)
    LOGGER.info('data_key = {}'.format(data_key))

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = get_bookmark(state, stream_name, start_date)
    file_max_bookmark_value = last_datetime

    # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT
    last_dttm = strptime_to_utc(last_datetime)
    last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S %Z")
    LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified))

    # Write schema and log selected fields for file stream and child csv stream(s)
    write_schema(catalog, stream_name)
    selected_fields = get_selected_fields(catalog, stream_name)
    LOGGER.info('Stream: {}, selected_fields: {}'.format(
        stream_name, selected_fields))
    children = endpoint_config.get('children')
    if children:
        for child_stream_name, child_endpoint_config in children.items():
            if child_stream_name in selected_streams:
                write_schema(catalog, child_stream_name)
                child_selected_fields = get_selected_fields(
                    catalog, child_stream_name)
                LOGGER.info('Stream: {}, selected_fields: {}'.format(
                    child_stream_name, child_selected_fields))

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    file_total_records = 0
    csv_total_records = 0
    next_url = '{}/{}'.format(client.base_url, search_path)

    i = 1
    while next_url is not None:
        LOGGER.info('Search URL for Stream {}: {}'.format(
            stream_name, next_url))

        # API request search_data
        search_data = {}
        search_data, next_url = client.get(url=next_url, endpoint=stream_name)
        LOGGER.info('next_url = {}'.format(next_url))
        # LOGGER.info('search_data = {}'.format(search_data)) # COMMENT OUT

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        search_items = search_data.get(data_key, [])
        if not search_items:
            LOGGER.info('Stream: {}, no files found'.format(stream_name))
            break  # No data results

        file_count = 0
        file_records = []
        csv_records = []
        for item in search_items:
            file_count = file_count + 1
            file_url = item.get('url')
            LOGGER.info('File URL for Stream {}: {}'.format(
                stream_name, file_url))
            file_data = {}
            headers = {}
            if bookmark_query_field:
                headers[bookmark_query_field] = last_modified
            # API request file_data for item, single-file (ignore file_next_url)
            file_data, file_next_url = client.get(url=file_url,
                                                  headers=headers,
                                                  endpoint=stream_name)
            # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT

            if file_data:
                content = file_data.get('content')
                content_list = []
                if content:
                    content_b64 = base64.b64decode(content)
                    content_str = content_b64.decode('utf-8')
                    file_like_object = io.StringIO(content_str)

                    with file_like_object as f:
                        reader = csv.DictReader(f)
                        content_list = [r for r in reader]

                file_modified = file_data.get('last_modified')
                file_sha = file_data.get('sha')
                file_path = file_data.get('path')
                file_name = file_data.get('name')

                # Remove _links, content nodes
                file_data.pop('_links', None)
                file_data.pop('content', None)

                # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT
                file_records.append(file_data)

                # Loop thru each child object and append csv records
                if children:
                    for child_stream_name, child_endpoint_config in children.items(
                    ):
                        if child_stream_name in selected_streams:
                            i = 1
                            for record in content_list:
                                record['git_path'] = file_path
                                record['git_sha'] = file_sha
                                record['git_last_modified'] = file_modified
                                record['git_file_name'] = file_name
                                record['row_number'] = i

                                # Transform record and append
                                transformed_csv_record = {}
                                transformed_csv_record = transform_record(
                                    child_stream_name, record)
                                csv_records.append(transformed_csv_record)

                                i = i + 1

        # Process file_records and get the max_bookmark_value and record_count
        file_max_bookmark_value, file_record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=file_records,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=file_max_bookmark_value,
            last_datetime=last_datetime)
        LOGGER.info('Stream {}, batch processed {} records'.format(
            stream_name, file_record_count))
        file_total_records = file_total_records + file_record_count

        # Loop thru each child object to process csv records
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                if child_stream_name in selected_streams:
                    csv_max_bookmark_value, csv_record_count = process_records(
                        catalog=catalog,
                        stream_name=child_stream_name,
                        records=csv_records,
                        time_extracted=time_extracted,
                        bookmark_field=None,
                        max_bookmark_value=None,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        child_stream_name, csv_record_count))
                    csv_total_records = csv_total_records + csv_record_count

        # to_rec: to record; ending record for the batch page
        to_rec = offset + file_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size) and page
        offset = offset + file_count
        page = page + 1
        i = i + 1

    # Update the state with the max_bookmark_value for the stream
    if bookmark_field:
        write_bookmark(state, stream_name, file_max_bookmark_value)

    # Return total_records across all pages
    LOGGER.info(
        'Synced Stream: {}, TOTAL pages: {}, file records: {}, csv records: {}'
        .format(stream_name, page - 1, file_total_records, csv_total_records))
    return file_total_records
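The file handling above base64-decodes the content field and parses it as CSV into a list of dicts. A self-contained sketch with a fabricated payload (the GitHub contents-API response shape is an assumption):

import base64
import csv
import io

# Fabricated base64 CSV content, illustration only
content = base64.b64encode(b'id,name\n1,alpha\n2,beta\n').decode('ascii')
content_str = base64.b64decode(content).decode('utf-8')
with io.StringIO(content_str) as f:
    content_list = list(csv.DictReader(f))
# content_list == [{'id': '1', 'name': 'alpha'}, {'id': '2', 'name': 'beta'}]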
Example no. 13
def sync_in_app_events():

    schema = load_schema("raw_data/in_app_events")
    singer.write_schema("in_app_events", schema, [
        "event_time",
        "event_name",
        "appsflyer_id"
    ])

    # This order matters
    fieldnames = (
        "attributed_touch_type",
        "attributed_touch_time",
        "install_time",
        "event_time",
        "event_name",
        "event_value",
        "event_revenue",
        "event_revenue_currency",
        "event_revenue_usd",
        "event_source",
        "is_receipt_validated",
        "af_prt",
        "media_source",
        "af_channel",
        "af_keywords",
        "campaign",
        "af_c_id",
        "af_adset",
        "af_adset_id",
        "af_ad",
        "af_ad_id",
        "af_ad_type",
        "af_siteid",
        "af_sub_siteid",
        "af_sub1",
        "af_sub2",
        "af_sub3",
        "af_sub4",
        "af_sub5",
        "af_cost_model",
        "af_cost_value",
        "af_cost_currency",
        "contributor1_af_prt",
        "contributor1_media_source",
        "contributor1_campaign",
        "contributor1_touch_type",
        "contributor1_touch_time",
        "contributor2_af_prt",
        "contributor2_media_source",
        "contributor2_campaign",
        "contributor2_touch_type",
        "contributor2_touch_time",
        "contributor3_af_prt",
        "contributor3_media_source",
        "contributor3_campaign",
        "contributor3_touch_type",
        "contributor3_touch_time",
        "region",
        "country_code",
        "state",
        "city",
        "postal_code",
        "dma",
        "ip",
        "wifi",
        "operator",
        "carrier",
        "language",
        "appsflyer_id",
        "advertising_id",
        "idfa",
        "android_id",
        "customer_user_id",
        "imei",
        "idfv",
        "platform",
        "device_type",
        "os_version",
        "app_version",
        "sdk_version",
        "app_id",
        "app_name",
        "bundle_id",
        "is_retargeting",
        "retargeting_conversion_type",
        "af_attribution_lookback",
        "af_reengagement_window",
        "is_primary_attribution",
        "user_agent",
        "http_referrer",
        "original_url",
    )

    stop_time = datetime.datetime.now(pytz.utc)
    from_datetime = get_start("in_app_events")
    to_datetime = get_stop(from_datetime, stop_time, 10)

    while from_datetime < stop_time:
        LOGGER.info("Syncing data from %s to %s", from_datetime, to_datetime)
        params = dict()
        params["from"] = from_datetime.strftime("%Y-%m-%d %H:%M")
        params["to"] = to_datetime.strftime("%Y-%m-%d %H:%M")
        params["api_token"] = CONFIG["api_token"]

        url = get_url("in_app_events", app_id=CONFIG["app_id"])
        request_data = request(url, params)

        csv_data = RequestToCsvAdapter(request_data)
        reader = csv.DictReader(csv_data, fieldnames)

        next(reader) # Skip the heading row

        bookmark = from_datetime
        for i, row in enumerate(reader):
            record = xform(row, schema)
            singer.write_record("in_app_events", record)
            # AppsFlyer returns records in order of most recent first.
            if utils.strptime_to_utc(record["event_time"]) > bookmark:
                bookmark = utils.strptime_to_utc(record["event_time"])

        # Write out state
        utils.update_state(STATE, "in_app_events", bookmark)
        singer.write_state(STATE)

        # Move the timings forward
        from_datetime = to_datetime
        to_datetime = get_stop(from_datetime, stop_time, 10)
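The loop above walks fixed-size date windows forward until it reaches "now". A sketch of that windowing follows, approximating get_stop as "start plus N days, capped at stop_time" (an assumption, not this tap's helper):

from datetime import datetime, timedelta, timezone

def get_stop(from_dt, stop_time, days):
    # Assumed behaviour: advance by `days`, never past stop_time
    return min(from_dt + timedelta(days=days), stop_time)

stop_time = datetime.now(timezone.utc)
from_dt = stop_time - timedelta(days=25)
while from_dt < stop_time:
    to_dt = get_stop(from_dt, stop_time, 10)
    # ... request data for [from_dt, to_dt), write records and state ...
    from_dt = to_dt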
Example no. 14
def sync_companies(state: State):
    bookmark_key = 'hs_lastmodifieddate'
    start = utils.strptime_to_utc(get_start(state, "companies", bookmark_key))
    logger.info("sync_companies from %s", start)
    schema = load_schema('companies')
    singer.write_schema("hubspot_companies", schema, ["companyId"],
                        [bookmark_key])

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we must store the current sync's start
    # in the state and not move the bookmark past this value.
    current_sync_start = get_current_sync_start(state,
                                                "companies") or utils.now()
    state = write_current_sync_start(state, "companies", current_sync_start)
    singer.write_state(state)

    url = get_url("companies_all")
    max_bk_value = start
    contacts_by_company_schema = load_schema(CONTACTS_BY_COMPANY)
    singer.write_schema("hubspot_contacts_by_company",
                        contacts_by_company_schema,
                        ["company-id", "contact-id"])

    for row in gen_request(state, 'companies', url, default_company_params,
                           'companies', 'has-more', ['offset'], ['offset']):
        row_properties = row['properties']
        modified_time = None
        if bookmark_key in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties[bookmark_key][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)
        elif 'createdate' in row_properties:
            # Hubspot returns timestamps in millis
            timestamp_millis = row_properties['createdate'][
                'timestamp'] / 1000.0
            modified_time = datetime.datetime.fromtimestamp(
                timestamp_millis, datetime.timezone.utc)

        if modified_time and modified_time >= max_bk_value:
            max_bk_value = modified_time

        if not modified_time or modified_time >= start:
            record = request(
                get_url("companies_detail",
                        company_id=row['companyId'])).json()
            record = build_record(record, schema)
            write_record('hubspot_companies', record)
            state = _sync_contacts_by_company(state, record['companyId'])

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(max_bk_value, current_sync_start)
    state = singer.write_bookmark(state, 'hubspot_companies', bookmark_key,
                                  utils.strftime(new_bookmark))
    state = write_current_sync_start(state, 'companies', None)
    singer.write_state(state)
    return state
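Both sync_companies variants convert HubSpot's millisecond timestamps before comparing against the bookmark. A sketch with a fabricated property payload:

import datetime

# Fabricated HubSpot-style property carrying epoch milliseconds
row_properties = {'hs_lastmodifieddate': {'timestamp': 1571006401000}}
timestamp_millis = row_properties['hs_lastmodifieddate']['timestamp'] / 1000.0
modified_time = datetime.datetime.fromtimestamp(timestamp_millis, datetime.timezone.utc)
# modified_time -> 2019-10-13 22:40:01+00:00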
Example no. 15
def sync_endpoint(client,
                  config,
                  catalog,
                  state,
                  stream_name,
                  endpoint_config,
                  sync_streams,
                  selected_streams,
                  parent_id=None):

    # endpoint_config variables
    base_path = endpoint_config.get('path', stream_name)
    bookmark_field = next(iter(endpoint_config.get('replication_keys', [])),
                          None)
    params = endpoint_config.get('params', {})
    bookmark_query_field_from = endpoint_config.get(
        'bookmark_query_field_from')
    bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to')
    data_key_array = endpoint_config.get('data_key_array')
    id_fields = endpoint_config.get('key_properties')
    parent = endpoint_config.get('parent')
    date_window_size = int(endpoint_config.get('date_window_size', '1'))

    # tap config variables
    start_date = config.get('start_date')
    attribution_window = config.get('attribution_window', 30)

    last_datetime = get_bookmark(state, stream_name, start_date,
                                 bookmark_field, parent, parent_id)
    max_bookmark_value = last_datetime

    # Convert to datetimes in local/ad account timezone
    now_datetime = utils.now()
    last_dttm = strptime_to_utc(last_datetime)

    if bookmark_query_field_from and bookmark_query_field_to:
        # date_window_size: Number of days in each date window
        # Set start window
        start_window = now_datetime - timedelta(days=attribution_window)
        if last_dttm < start_window:
            start_window = last_dttm + timedelta(
                days=1)  # makes sure that we don't have duplicated data
        # Set end window
        end_window = start_window + timedelta(days=date_window_size)
        if end_window > now_datetime:
            end_window = now_datetime

    else:
        start_window = last_dttm
        end_window = now_datetime
        diff_sec = (end_window - start_window).total_seconds()
        date_window_size = math.ceil(
            diff_sec / (3600 * 24))  # round-up difference to days

    endpoint_total = 0
    total_records = 0

    while start_window < now_datetime:
        LOGGER.info('START Sync for Stream: {}{}'.format(
            stream_name,
            ', Date window from: {} to {}'.format(start_window.date(), end_window.date()) \
                if bookmark_query_field_from else ''))

        if bookmark_query_field_from and bookmark_query_field_to:
            # Query parameter startDate and endDate must be in Eastern time zone
            # API will error if future dates are requested

            # DAY based
            window_start_dt_str = start_window.date().strftime(
                '%Y-%m-%dT00:00:00')
            window_end_dt_str = end_window.date().strftime('%Y-%m-%dT23:59:59')

            params[bookmark_query_field_from] = window_start_dt_str
            params[bookmark_query_field_to] = window_end_dt_str

        path = base_path.format(parent_id=parent_id)

        total_records = 0

        # concatenate params
        querystring = '&'.join(
            ['%s=%s' % (key, value) for (key, value) in params.items()])

        # initialize url
        url = '{}/{}?{}'.format(client.base_url, path, querystring)

        # API request data
        data = {}
        try:
            data = client.get(url=url, endpoint=stream_name)
        except Exception as err:
            LOGGER.error('{}'.format(err))
            LOGGER.error('URL for Stream {}: {}'.format(stream_name, url))
            raise Exception(err)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        if not data or data is None or data == {}:
            LOGGER.info('No data results returned')
        else:
            # Transform data with transform_json from transform.py
            # The data_key_array identifies the array/list of records below the <root> element
            # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
            transformed_data = []  # initialize the record list

            if data_key_array:
                data_records = data.get(data_key_array, [])
            else:
                data_records = data

            for record in data_records:
                # Add parent id field/value
                if parent and parent_id and parent not in record:
                    record[parent] = parent_id

                # transform record (remove inconsistent use of CamelCase)
                try:
                    transformed_record = humps.decamelize(record)
                except Exception as err:
                    LOGGER.error('{}'.format(err))
                    LOGGER.error('error record: {}'.format(record))
                    raise Exception(err)

                transformed_data.append(transformed_record)
                # End for record in array
            # End non-stats stream

            # LOGGER.info('transformed_data = {}'.format(transformed_data)) # COMMENT OUT
            if not transformed_data or transformed_data is None:
                LOGGER.info('No transformed data for data = {}'.format(data))
            else:

                # Process records and get the max_bookmark_value and record_count
                record_count = 0
                if stream_name in sync_streams:
                    max_bookmark_value, record_count = process_records(
                        catalog=catalog,
                        stream_name=stream_name,
                        records=transformed_data,
                        time_extracted=time_extracted,
                        bookmark_field=bookmark_field,
                        max_bookmark_value=max_bookmark_value,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        stream_name, record_count))

                # Loop thru parent batch records for each children objects (if should stream)
                children = endpoint_config.get('children')
                if children:
                    for child_stream_name, child_endpoint_config in children.items(
                    ):
                        if child_stream_name in sync_streams:
                            LOGGER.info(
                                'START Syncing: {}'.format(child_stream_name))
                            write_schema(catalog, child_stream_name)
                            # For each parent record
                            for record in transformed_data:
                                i = 0
                                # Set parent_id
                                for id_field in id_fields:
                                    if i == 0:
                                        parent_id_field = id_field
                                    if id_field == 'id':
                                        parent_id_field = id_field
                                    i = i + 1
                                parent_id = record.get(parent_id_field)

                                # sync_endpoint for child
                                LOGGER.info(
                                    'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'\
                                        .format(child_stream_name, stream_name, parent_id))

                                child_total_records = sync_endpoint(
                                    client=client,
                                    config=config,
                                    catalog=catalog,
                                    state=state,
                                    stream_name=child_stream_name,
                                    endpoint_config=child_endpoint_config,
                                    sync_streams=sync_streams,
                                    selected_streams=selected_streams,
                                    parent_id=parent_id)

                                LOGGER.info(
                                    'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'\
                                        .format(child_stream_name, parent_id, child_total_records))
                                # End transformed data record loop
                            # End if child in sync_streams
                        # End child streams for parent
                    # End if children

                # Parent record batch
                total_records = total_records + record_count
                endpoint_total = endpoint_total + record_count

                LOGGER.info('Synced Stream: {}, records: {}'.format(
                    stream_name, total_records))

                # Update the state with the max_bookmark_value for the stream date window
                # Snapchat Ads API does not allow page/batch sorting; bookmark written for date window
                if bookmark_field and stream_name in selected_streams:
                    write_bookmark(state, stream_name, max_bookmark_value,
                                   bookmark_field, parent, parent_id)

        # Increment date window and sum endpoint_total
        start_window = end_window + timedelta(days=1)
        next_end_window = end_window + timedelta(days=date_window_size)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window
        # End date window

    # Return total_records (for date windows)
    return endpoint_total
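The window advance at the bottom of the loop above moves one day past the current end window and caps the next window at "now". A sketch with illustrative values, using min() in place of the explicit if/else:

from datetime import datetime, timedelta, timezone

# Illustrative values; date_window_size would come from endpoint_config
now_datetime = datetime.now(timezone.utc)
date_window_size = 7
end_window = now_datetime - timedelta(days=10)

# Next window starts the day after the previous one ended...
start_window = end_window + timedelta(days=1)
# ...and ends date_window_size days later, capped at "now"
end_window = min(end_window + timedelta(days=date_window_size), now_datetime)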
Example no. 16
def transform_report(report_name, report_data, account_id):
    time_series_length = int(report_data.get('time_series_length',
                                             1))  # Default = 1 to loop once
    request = report_data.get('request', {})

    # request params
    params = request.get('params', {})
    entity = params.get('entity')
    granularity = params.get('granularity')
    placement = params.get('placement')
    segmentation_type = params.get('segmentation_type')
    country = params.get('country')
    platform = params.get('platform')
    start_time = params.get('start_time')
    end_time = params.get('end_time')

    LOGGER.info(
        'Report: {} - transform_report, absolute start_time: {}'.format(
            report_name, start_time))
    LOGGER.info('Report: {} - transform_report, absolute end_time: {}'.format(
        report_name, end_time))
    LOGGER.info('Report: {} - transform_report, time_series_length: {}'.format(
        report_name, time_series_length))

    report_records = []

    if granularity == 'DAY':
        interval = timedelta(days=1)
    elif granularity == 'HOUR':
        interval = timedelta(hours=1)
    elif granularity == 'TOTAL':
        interval = timedelta(days=0)  # 0 days for TOTAL

    # Loop through entity_id records w/ data
    for id_record in report_data.get('data'):
        # LOGGER.info('id_record = {}'.format(id_record)) # COMMENT OUT
        entity_id = id_record.get('id')
        # LOGGER.info('entity_id = {}'.format(entity_id)) # COMMENT OUT
        id_data = []
        id_data = id_record.get('id_data')

        # Loop through id_data records
        for datum in id_data:
            # Loop through time intervals
            start_dttm = strptime_to_utc(start_time)
            end_dttm = start_dttm + interval
            i = 0
            while i <= (time_series_length - 1):
                series_start = strftime(start_dttm)
                series_end = strftime(end_dttm)

                append_record = False  # Initialize; only append records w/ metric data
                segment = datum.get('segment')
                segment_name = None
                segment_value = None
                if segment:
                    segment_name = segment.get('segment_name')
                    segment_value = segment.get('segment_value')

                dimensions = {
                    'report_name': report_name,
                    'account_id': account_id,
                    'entity': entity,
                    'entity_id': entity_id,
                    'granularity': granularity,
                    'placement': placement,
                    'start_time': series_start,
                    'end_time': series_end,
                    'segmentation_type': segmentation_type,
                    'segment_name': segment_name,
                    'segment_value': segment_value,
                    'country': country,
                    'platform': platform
                }

                # Create MD5 hash key of the sorted JSON dimensions (above)
                dims_md5 = str(
                    hash_data(json.dumps(dimensions, sort_keys=True)))
                record = {
                    '__sdc_dimensions_hash_key': dims_md5,
                    'start_time': series_start,
                    'end_time': series_end,
                    'dimensions': dimensions
                }

                # LOGGER.info('dimensions_hash_key = {}'.format(dims_md5)) # COMMENT OUT

                # Get time interval value from metrics value arrays
                metrics = datum.get('metrics', {})
                for key, val in list(metrics.items()):
                    # Determine nested object group for each measure
                    if key[0:7] == 'billed_':
                        group = 'billing'
                    elif key[0:6] == 'media_':
                        group = 'media'
                    elif key[0:6] == 'video_':
                        group = 'video'
                    elif key[0:11] == 'conversion_':
                        group = 'web_conversion'
                    elif key[0:18] == 'mobile_conversion_':
                        group = 'mobile_conversion'
                    else:
                        group = 'engagement'
                    # Create group node if not exists
                    if not record.get(group):
                        record[group] = {}

                    if isinstance(val, list):
                        index_val = None
                        try:
                            index_val = val[i]
                            record[group][key] = index_val
                            append_record = True
                        except IndexError:
                            index_val = None
                    elif isinstance(val, dict):
                        new_dict = {}
                        for key2, val2 in list(val.items()):
                            idx_val = None
                            if isinstance(val2, list):
                                try:
                                    idx_val = val2[i]
                                    new_dict[key2] = idx_val
                                    append_record = True
                                except IndexError:
                                    idx_val = None
                        if new_dict != {}:
                            record[group][key] = new_dict
                    # End for key, val in metrics

                # LOGGER.info('record = {}'.format(record)) # COMMENT OUT
                # LOGGER.info('append_record = {}'.format(append_record)) # COMMENT OUT
                if append_record:
                    report_records.append(record)
                i = i + 1
                start_dttm = end_dttm
                end_dttm = start_dttm + interval
                # End: while i < time_series_length

            # End: for datum in id_data

        # End: for id_record in report_data

    return report_records
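The __sdc_dimensions_hash_key above is an MD5 over the canonical (sorted-key) JSON of the dimensions. Since hash_data is not shown in this example, the sketch below approximates it with hashlib:

import hashlib
import json

# Illustrative dimensions subset; hash_data is approximated with plain MD5
dimensions = {'report_name': 'engagement', 'entity_id': 'abc123', 'start_time': '2020-01-01T00:00:00Z'}
dims_md5 = hashlib.md5(json.dumps(dimensions, sort_keys=True).encode('utf-8')).hexdigest()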
Example no. 17
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        search_path,
        endpoint_config,
        git_owner,
        git_repository,
        bookmark_query_field=None,
        bookmark_field=None,
        data_key=None,
        id_fields=None,
        selected_streams=None):

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = get_bookmark(state, stream_name, start_date)
    file_max_bookmark_value = last_datetime
    # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT
    last_dttm = strptime_to_utc(last_datetime)
    last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S %Z")
    LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified))

    # Write schema and log selected fields for file stream and child lkml stream(s)
    write_schema(catalog, stream_name)
    selected_fields = get_selected_fields(catalog, stream_name)
    LOGGER.info('Stream: {}, selected_fields: {}'.format(
        stream_name, selected_fields))
    children = endpoint_config.get('children')
    if children:
        for child_stream_name, child_endpoint_config in children.items():
            if child_stream_name in selected_streams:
                write_schema(catalog, child_stream_name)
                child_selected_fields = get_selected_fields(
                    catalog, child_stream_name)
                LOGGER.info('Stream: {}, selected_fields: {}'.format(
                    child_stream_name, child_selected_fields))

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    file_total_records = 0
    lkml_total_records = 0
    next_url = '{}/{}'.format(client.base_url, search_path)

    i = 1
    while next_url is not None:
        LOGGER.info('Search URL for Stream {}: {}'.format(
            stream_name, next_url))

        # API request search_data
        search_data = {}
        search_data, next_url = client.get(url=next_url, endpoint=stream_name)

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        search_items = search_data.get(data_key)
        if not search_items:
            break  # No data results

        file_count = 0
        file_records = []
        lkml_records = []
        for item in search_items:
            file_count = file_count + 1
            file_url = item.get('url')
            LOGGER.info('File URL for Stream {}: {}'.format(
                stream_name, file_url))
            file_data = {}
            headers = {}
            if bookmark_query_field:
                headers[bookmark_query_field] = last_modified
            # API request file_data for item, single-file (ignore file_next_url)
            file_data, file_next_url = client.get(url=file_url,
                                                  headers=headers,
                                                  endpoint=stream_name)
            # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT

            if file_data:
                content = file_data.get('content')
                content_dict = {}
                if content:
                    content_b64 = base64.b64decode(content)
                    content_str = content_b64.decode('utf-8')
                    content_dict = lkml.load(content_str)

                file_modified = file_data.get('last_modified')
                file_sha = file_data.get('sha')
                file_path = file_data.get('path')

                # Remove _links, content nodes, add git info
                file_data.pop('_links', None)
                file_data.pop('content', None)
                file_data['git_owner'] = git_owner
                file_data['git_repository'] = git_repository
                # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT
                file_records.append(file_data)

                # Loop thru each child object and append lkml records
                if children:
                    for child_stream_name, child_endpoint_config in children.items(
                    ):
                        if child_stream_name in selected_streams:
                            child_data_key = child_endpoint_config.get(
                                'data_key')
                            if child_data_key and child_data_key in content_dict:
                                for record in content_dict.get(
                                        child_data_key, []):
                                    record['path'] = file_path
                                    record['sha'] = file_sha
                                    record['last_modified'] = file_modified
                                    record['git_owner'] = git_owner
                                    record['git_repository'] = git_repository
                                    lkml_records.append(record)
                            else:
                                content_dict['path'] = file_path
                                content_dict['sha'] = file_sha
                                content_dict['last_modified'] = file_modified
                                content_dict['git_owner'] = git_owner
                                content_dict['git_repository'] = git_repository
                                lkml_records.append(content_dict)

        # Process file_records and get the max_bookmark_value and record_count
        file_max_bookmark_value, file_record_count = process_records(
            catalog=catalog,
            stream_name=stream_name,
            records=file_records,
            time_extracted=time_extracted,
            bookmark_field=bookmark_field,
            max_bookmark_value=file_max_bookmark_value,
            last_datetime=last_datetime)
        LOGGER.info('Stream {}, batch processed {} records'.format(
            stream_name, file_record_count))
        file_total_records = file_total_records + file_record_count

        # Loop thru each child object to process lkml records
        if children:
            for child_stream_name, child_endpoint_config in children.items():
                if child_stream_name in selected_streams:
                    lkml_max_bookmark_value, lkml_record_count = process_records(
                        catalog=catalog,
                        stream_name=child_stream_name,
                        records=lkml_records,
                        time_extracted=time_extracted,
                        bookmark_field=None,
                        max_bookmark_value=None,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        child_stream_name, lkml_record_count))
                    lkml_total_records = lkml_total_records + lkml_record_count

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, file_max_bookmark_value)

        # to_rec: to record; ending record for the batch page
        to_rec = offset + file_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))
        # Pagination: increment the offset by the limit (batch-size) and page
        offset = offset + file_count
        page = page + 1
        i = i + 1

    # Return total_records across all pages
    LOGGER.info(
        'Synced Stream: {}, TOTAL pages: {}, file records: {}, lookml records: {}'
        .format(stream_name, page - 1, file_total_records, lkml_total_records))
    return file_total_records
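The content handling above base64-decodes a LookML file and parses it with the third-party lkml library. A sketch with a fabricated payload; lkml.load is assumed to accept a LookML string and return a plain dict, as it is used in the example:

import base64

import lkml  # third-party LookML parser used in the example above

# Fabricated GitHub-style base64 content, illustration only
content = base64.b64encode(b'view: users {\n  sql_table_name: public.users ;;\n}\n').decode('ascii')
content_str = base64.b64decode(content).decode('utf-8')
content_dict = lkml.load(content_str)  # plain dict representation of the LookML source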
Example no. 18
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        path,
        endpoint_config,
        bookmark_field=None,
        project_timezone=None,
        days_interval=None,
        attribution_window=None,
        export_events=None,
        denest_properties_flag=None):

    # Get endpoint_config fields
    url = endpoint_config.get('url')
    data_key = endpoint_config.get('data_key', 'results')
    api_method = endpoint_config.get('api_method')
    parent_path = endpoint_config.get('parent_path')
    parent_id_field = endpoint_config.get('parent_id_field')
    static_params = endpoint_config.get('params', {})
    bookmark_query_field_from = endpoint_config.get(
        'bookmark_query_field_from')
    bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to')
    id_fields = endpoint_config.get('key_properties')
    date_dictionary = endpoint_config.get('date_dictionary', False)
    pagination = endpoint_config.get('pagination', False)

    # Get the latest bookmark for the stream and set the last_integer/datetime
    last_datetime = None
    max_bookmark_value = None
    last_datetime = get_bookmark(state, stream_name, start_date)
    max_bookmark_value = last_datetime

    write_schema(catalog, stream_name)

    # windowing: loop through date days_interval date windows from last_datetime to now_datetime
    tzone = pytz.timezone(project_timezone)
    now_datetime = datetime.now(tzone)

    if bookmark_query_field_from and bookmark_query_field_to:
        # days_interval from config date_window_size, default = 60; passed to function from sync
        if not days_interval:
            days_interval = 30

        last_dttm = strptime_to_utc(last_datetime)
        delta_days = (now_datetime - last_dttm).days
        if delta_days <= attribution_window:
            delta_days = attribution_window
            LOGGER.info(
                "Start bookmark less than {} day attribution window.".format(
                    attribution_window))
        elif delta_days >= 365:
            delta_days = 365
            LOGGER.warning(
                "WARNING: Start date or bookmark greater than 1 year maxiumum."
            )
            LOGGER.warning("WARNING: Setting bookmark start to 1 year ago.")

        start_window = now_datetime - timedelta(days=delta_days)
        end_window = start_window + timedelta(days=days_interval)
        if end_window > now_datetime:
            end_window = now_datetime
    else:
        start_window = strptime_to_utc(last_datetime)
        end_window = now_datetime
        diff_sec = (end_window - start_window).total_seconds()
        days_interval = math.ceil(diff_sec /
                                  (3600 * 24))  # round-up difference to days

    # LOOP order: Date Windows, Parent IDs, Page
    # Initialize counter
    endpoint_total = 0  # Total for ALL: parents, date windows, and pages

    # Begin date windowing loop
    while start_window < now_datetime:
        # Initialize counters
        date_total = 0  # Total records for a date window
        parent_total = 0  # Total records for parent ID
        total_records = 0  # Total records for all pages
        record_count = 0  # Total processed for page

        params = static_params  # adds in endpoint specific, sort, filter params

        if bookmark_query_field_from and bookmark_query_field_to:
            # Request dates need to be normalized to project timezone or else errors may occur
            # Errors occur when from_date is > 365 days ago
            #   and when to_date > today (in project timezone)
            from_date = '{}'.format(start_window.astimezone(tzone))[0:10]
            to_date = '{}'.format(end_window.astimezone(tzone))[0:10]
            LOGGER.info('START Sync for Stream: {}{}'.format(
                stream_name,
                ', Date window from: {} to {}'.format(from_date, to_date) \
                    if bookmark_query_field_from else ''))
            params[bookmark_query_field_from] = from_date
            params[bookmark_query_field_to] = to_date

        # funnels and cohorts have a parent endpoint with parent_data and parent_id_field
        if parent_path and parent_id_field:
            # API request data
            LOGGER.info('URL for Parent Stream {}: {}/{}'.format(
                stream_name, url, parent_path))
            parent_data = client.request(method='GET',
                                         url=url,
                                         path=parent_path,
                                         endpoint='parent_data')
        # Other endpoints (not funnels, cohorts): Simulate parent_data with single record
        else:
            parent_data = [{'id': 'none'}]
            parent_id_field = 'id'

        for parent_record in parent_data:
            parent_id = parent_record.get(parent_id_field)
            LOGGER.info('START: Stream: {}, parent_id: {}'.format(
                stream_name, parent_id))

            # pagination: loop thru all pages of data using next (if not None)
            page = 0  # First page is page=0, second page is page=1, ...
            offset = 0
            limit = 250  # Default page_size
            # Initialize counters
            parent_total = 0  # Total records for parent ID
            total_records = 0  # Total records for all pages
            record_count = 0  # Total processed for page

            session_id = 'initial'
            if pagination:
                params['page_size'] = limit

            while offset <= total_records and session_id is not None:
                if pagination and page != 0:
                    params['session_id'] = session_id
                    params['page'] = page

                # querystring: Squash query params into string and replace [parent_id]
                querystring = '&'.join(['%s=%s' % (key, value) for (key, value) \
                    in params.items()]).replace(
                        '[parent_id]', str(parent_id))

                if stream_name == 'export' and export_events:
                    event = json.dumps([export_events] if isinstance(
                        export_events, str) else export_events)
                    url_encoded = urllib.parse.quote(event)
                    querystring += f'&event={url_encoded}'

                full_url = '{}/{}{}'.format(
                    url, path,
                    '?{}'.format(querystring) if querystring else '')

                LOGGER.info('URL for Stream {}: {}'.format(
                    stream_name, full_url))

                # API request data
                data = {}

                # Export has a streaming api call
                if stream_name == 'export':
                    data = client.request_export(method=api_method,
                                                 url=url,
                                                 path=path,
                                                 params=querystring,
                                                 endpoint=stream_name)

                    # time_extracted: datetime when the data was extracted from the API
                    time_extracted = utils.now()
                    transformed_data = []
                    for record in data:
                        if record and str(record) != '':
                            # transform record and append to transformed_data array
                            transformed_record = transform_record(record, stream_name, \
                                project_timezone, denest_properties_flag)
                            transformed_data.append(transformed_record)

                            # Check for missing keys
                            for key in id_fields:
                                val = transformed_record.get(key)
                                if val == '' or not val:
                                    LOGGER.error('Error: Missing Key')
                                    raise ValueError('Missing Key')

                            if len(transformed_data) == limit:
                                # Process full batch (limit = 250) records
                                #   and get the max_bookmark_value and record_count
                                max_bookmark_value, record_count = process_records(
                                    catalog=catalog,
                                    stream_name=stream_name,
                                    records=transformed_data,
                                    time_extracted=time_extracted,
                                    bookmark_field=bookmark_field,
                                    max_bookmark_value=max_bookmark_value,
                                    last_datetime=last_datetime)
                                total_records = total_records + record_count
                                parent_total = parent_total + record_count
                                date_total = date_total + record_count
                                endpoint_total = endpoint_total + record_count
                                transformed_data = []

                                LOGGER.info(
                                    'Stream {}, batch processed {} records, total {}, max bookmark {}'
                                    .format(stream_name, record_count,
                                            endpoint_total,
                                            max_bookmark_value))
                                # End if (batch = limit 250)
                            # End if record
                        # End has export_data records loop

                    # Process remaining, partial batch
                    if len(transformed_data) > 0:
                        max_bookmark_value, record_count = process_records(
                            catalog=catalog,
                            stream_name=stream_name,
                            records=transformed_data,
                            time_extracted=time_extracted,
                            bookmark_field=bookmark_field,
                            max_bookmark_value=max_bookmark_value,
                            last_datetime=last_datetime)
                        LOGGER.info(
                            'Stream {}, batch processed {} records'.format(
                                stream_name, record_count))

                        total_records = total_records + record_count
                        parent_total = parent_total + record_count
                        date_total = date_total + record_count
                        endpoint_total = endpoint_total + record_count
                        # End if transformed_data

                    # Export does not provide pagination; session_id = None breaks out of loop.
                    session_id = None
                    # End export stream API call

                else:  # stream_name != 'export`
                    data = client.request(method=api_method,
                                          url=url,
                                          path=path,
                                          params=querystring,
                                          endpoint=stream_name)

                    # time_extracted: datetime when the data was extracted from the API
                    time_extracted = utils.now()
                    if not data:  # covers None, {} and []
                        LOGGER.info('No data for URL: {}'.format(full_url))
                        # No data results
                    else:  # has data
                        # Transform data with transform_json from transform.py
                        # The data_key identifies the array/list of records below the <root> element
                        # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
                        transformed_data = []  # initialize the record list

                        # The funnels and revenue endpoints return results as a dictionary keyed by date;
                        #   standardize the results to a list/array
                        if date_dictionary and data_key in data:
                            results = {}
                            results_list = []
                            for key, val in data[data_key].items():
                                # skip $overall summary
                                if key != '$overall':
                                    val['date'] = key
                                    val['datetime'] = '{}T00:00:00Z'.format(
                                        key)
                                    results_list.append(val)
                            results[data_key] = results_list
                            data = results

                        # Cohorts endpoint returns results as a list/array (no data_key)
                        # All other endpoints have a data_key
                        if data_key is None or data_key == '.':
                            data_key = 'results'
                            new_data = {'results': data}
                            data = new_data

                        transformed_data = []
                        # Loop through result records
                        for record in data[data_key]:
                            # transform record and append to the transformed_data list
                            transformed_record = transform_record(
                                record, stream_name, project_timezone,
                                parent_record)
                            transformed_data.append(transformed_record)

                            # Check for missing keys
                            for key in id_fields:
                                val = transformed_record.get(key)
                                if not val:  # covers missing, None, and empty-string key values
                                    LOGGER.error('Error: Missing Key: {}'.format(key))
                                    raise ValueError('Missing Key: {}'.format(key))

                            # End data record loop

                        if not transformed_data:  # covers None and []
                            LOGGER.info(
                                'No transformed data for data = {}'.format(
                                    data))
                            # No transformed data results
                        else:  # has transformed data
                            # Process records and get the max_bookmark_value and record_count
                            max_bookmark_value, record_count = process_records(
                                catalog=catalog,
                                stream_name=stream_name,
                                records=transformed_data,
                                time_extracted=time_extracted,
                                bookmark_field=bookmark_field,
                                max_bookmark_value=max_bookmark_value,
                                last_datetime=last_datetime)
                            LOGGER.info(
                                'Stream {}, batch processed {} records'.format(
                                    stream_name, record_count))

                            # set total_records and pagination fields
                            if page == 0:
                                if isinstance(data, dict):
                                    total_records = data.get(
                                        'total', record_count)
                                else:
                                    total_records = record_count
                            parent_total = parent_total + record_count
                            date_total = date_total + record_count
                            endpoint_total = endpoint_total + record_count
                            if isinstance(data, dict):
                                session_id = data.get('session_id', None)

                            # to_rec: to record; ending record for the batch page
                            if pagination:
                                to_rec = offset + limit
                                if to_rec > total_records:
                                    to_rec = total_records
                            else:
                                to_rec = record_count

                            LOGGER.info(
                                'Synced Stream: {}, page: {}, {} to {} of total: {}'
                                .format(stream_name, page, offset, to_rec,
                                        total_records))
                            # End has transformed data
                        # End has data results

                    # Pagination: increment the offset by the limit (batch-size) and page
                    offset = offset + limit
                    page = page + 1
                    # End page/batch loop
                # End stream != 'export'
            LOGGER.info('FINISHED: Stream: {}, parent_id: {}'.format(
                stream_name, parent_id))
            LOGGER.info('  Total records for parent: {}'.format(parent_total))
            # End parent record loop

        LOGGER.info('FINISHED Sync for Stream: {}{}'.format(
            stream_name,
            ', Date window from: {} to {}'.format(from_date, to_date) \
                if bookmark_query_field_from else ''))
        LOGGER.info('  Total records for date window: {}'.format(date_total))
        # Increment date window
        start_window = end_window
        next_end_window = end_window + timedelta(days=days_interval)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window

        # Update the state with the max_bookmark_value for the stream
        if bookmark_field:
            write_bookmark(state, stream_name, max_bookmark_value)

        # End date window loop

    # Return endpoint_total across all batches
    return endpoint_total
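The date-window advance at the bottom of the loop above (set start_window to the old end_window, then clamp the next end_window at "now") is easy to isolate. A minimal stdlib-only sketch of that pattern; advance_window and the 30-day interval are illustrative names/values, not part of the tap:

from datetime import datetime, timedelta, timezone

def advance_window(end_window, days_interval, now_datetime):
    """Return the next (start_window, end_window) pair, clamped so it never passes 'now'."""
    start_window = end_window
    next_end_window = end_window + timedelta(days=days_interval)
    end_window = min(next_end_window, now_datetime)
    return start_window, end_window

# Usage: walk 30-day windows from 2020-01-01 up to 'now'
now_dt = datetime.now(timezone.utc)
start = datetime(2020, 1, 1, tzinfo=timezone.utc)
end = min(start + timedelta(days=30), now_dt)
while start < now_dt:
    print('window', start.date(), '->', end.date())
    start, end = advance_window(end, 30, now_dt)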
Esempio n. 19
0
def process_records(
        catalog,  #pylint: disable=too-many-branches
        stream_name,
        records,
        time_extracted,
        bookmark_field=None,
        bookmark_type=None,
        max_bookmark_value=None,
        last_datetime=None,
        last_integer=None,
        parent=None,
        parent_id=None):
    stream = catalog.get_stream(stream_name)
    schema = stream.schema.to_dict()
    stream_metadata = metadata.to_map(stream.metadata)

    with metrics.record_counter(stream_name) as counter:
        for record in records:
            # If child object, add parent_id to record
            if parent_id and parent:
                record[parent + '_id'] = parent_id

            # Transform record for Singer.io
            with Transformer() as transformer:
                transformed_record = transformer.transform(
                    record, schema, stream_metadata)

                # Reset max_bookmark_value to new value if higher
                if bookmark_field and (bookmark_field in transformed_record):
                    bookmark_dttm = strptime_to_utc(
                        transformed_record[bookmark_field])
                    if max_bookmark_value:
                        max_bookmark_value_dttm = strptime_to_utc(
                            max_bookmark_value)
                        if bookmark_dttm > max_bookmark_value_dttm:
                            max_bookmark_value = transformed_record[
                                bookmark_field]
                    else:
                        max_bookmark_value = transformed_record[bookmark_field]

                if bookmark_field and (bookmark_field in transformed_record):
                    if bookmark_type == 'integer':
                        # Keep only records whose bookmark is after the last_integer
                        if transformed_record[bookmark_field] >= last_integer:
                            write_record(stream_name,
                                         transformed_record,
                                         time_extracted=time_extracted)
                            counter.increment()
                    elif bookmark_type == 'datetime':
                        last_dttm = transform_datetime(last_datetime)
                        bookmark_dttm = transform_datetime(
                            transformed_record[bookmark_field])
                        # Keep only records whose bookmark is after the last_datetime
                        if bookmark_dttm >= last_dttm:
                            write_record(stream_name,
                                         transformed_record,
                                         time_extracted=time_extracted)
                            counter.increment()
                else:
                    write_record(stream_name,
                                 transformed_record,
                                 time_extracted=time_extracted)
                    counter.increment()

        return max_bookmark_value, len(records)
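The bookmark handling in process_records boils down to "keep the later of the record's bookmark and the running maximum". A minimal sketch of that comparison using only the standard library; track_max_bookmark is an illustrative name, and the tap itself uses singer's strptime_to_utc rather than fromisoformat:

from datetime import datetime

def track_max_bookmark(record_value, max_bookmark_value):
    """Return the later of a record's bookmark and the running maximum (ISO-8601 strings)."""
    if max_bookmark_value is None:
        return record_value
    record_dttm = datetime.fromisoformat(record_value.replace('Z', '+00:00'))
    max_dttm = datetime.fromisoformat(max_bookmark_value.replace('Z', '+00:00'))
    return record_value if record_dttm > max_dttm else max_bookmark_value

# Usage
print(track_max_bookmark('2021-06-02T00:00:00Z', '2021-06-01T00:00:00Z'))  # -> 2021-06-02T00:00:00Z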
Esempio n. 20
0
def advance_bookmark(worklogs):
    raise_if_bookmark_cannot_advance(worklogs)
    new_last_updated = max(
        utils.strptime_to_utc(w["updated"]) for w in worklogs)
    return new_last_updated
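advance_bookmark depends on raise_if_bookmark_cannot_advance, which is defined elsewhere in that tap. A hypothetical sketch of what such a guard might check, purely for illustration (the body below is an assumption, not the tap's actual logic):

def raise_if_bookmark_cannot_advance(worklogs):
    # Hypothetical guard: without any worklogs there is no 'updated' value to advance to.
    if not worklogs:
        raise ValueError('Cannot advance bookmark: no worklogs were returned')

# Usage
raise_if_bookmark_cannot_advance([{'updated': '2021-06-01T00:00:00.000+0000'}])  # passes silently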
Esempio n. 21
0
    def update_bookmark(self, state, value):
        current_bookmark = self.get_bookmark(state)

        if value and utils.strptime_to_utc(value) > current_bookmark:
            write_bookmark(state, self.name, self.replication_key, value)
Esempio n. 22
0
def sync_endpoint(
        client,  #pylint: disable=too-many-branches
        catalog,
        state,
        start_date,
        stream_name,
        search_path,
        endpoint_config,
        bookmark_field=None,
        selected_streams=None):

    # Endpoint parameters
    bookmark_query_field = endpoint_config.get('bookmark_query_field', None)
    data_key = endpoint_config.get('data_key', stream_name)
    exclude_files = endpoint_config.get('exclude_files', [])
    csv_delimiter = endpoint_config.get('csv_delimiter', ',')
    skip_header_rows = endpoint_config.get('skip_header_rows', 0)
    activate_version_ind = endpoint_config.get('activate_version', False)
    alt_character_set = endpoint_config.get('alt_character_set', 'utf-8')
    # LOGGER.info('data_key = {}'.format(data_key))

    # Get the latest bookmark for the stream and set the last_datetime
    last_datetime = get_bookmark(state, stream_name, start_date)
    last_dttm = strptime_to_utc(last_datetime)
    timezone = pytz.timezone('UTC')
    bookmark_dttm = utils.now()  # Initialize bookmark_dttm
    max_bookmark_value = None

    # Convert to GitHub date format, example: Sun, 13 Oct 2019 22:40:01 GMT
    last_modified = last_dttm.strftime("%a, %d %b %Y %H:%M:%S GMT")
    LOGGER.info('HEADER If-Modified-Since: {}'.format(last_modified))

    # Write schema and log selected fields for stream
    write_schema(catalog, stream_name)
    selected_fields = get_selected_fields(catalog, stream_name)
    LOGGER.info('Stream: {}, selected_fields: {}'.format(
        stream_name, selected_fields))

    # pagination: loop thru all pages of data using next_url (if not None)
    page = 1
    offset = 0
    file_count = 0
    total_records = 0
    next_url = '{}/{}'.format(client.base_url, search_path)

    # Loop through all search items pages (while there are more pages, next_url)
    #   and until bookmark_dttm < last_dttm
    first_record = True
    while next_url is not None and bookmark_dttm >= last_dttm:
        LOGGER.info('Search URL for Stream {}: {}'.format(
            stream_name, next_url))

        # API request search_data
        search_data = {}
        search_data, next_url, search_last_modified = client.get(
            url=next_url, endpoint=stream_name)
        LOGGER.info('next_url = {}'.format(next_url))
        # LOGGER.info('search_data = {}'.format(search_data)) # COMMENT OUT

        # time_extracted: datetime when the data was extracted from the API
        time_extracted = utils.now()
        search_items = search_data.get(data_key, [])
        if not search_items:
            LOGGER.info('Stream: {}, no files found'.format(stream_name))
            break  # No data results

        i = 0  # i = search item number
        item_total = len(search_items)
        # Loop through all search items until bookmark_dttm < last_dttm
        while i <= (item_total - 1) and bookmark_dttm >= last_dttm:
            item = search_items[i]
            file_name = item.get('name')
            # Skip excluded files
            if file_name in exclude_files:
                i = i + 1
                if i > (item_total - 1):
                    break
                else:
                    item = search_items[i]
            csv_records = []
            file_count = file_count + 1
            # url (content url) is preferable to git_url (blob url) b/c it provides
            #   last-modified header for bookmark
            # However, git_url allows for up to 100 MB files; url allows for up to 1 MB files
            # Therefore, we use the git_url (blob) endpoint
            # And make another call to the commits endpoint to get last-modified
            file_url = item.get('git_url')
            git_repository = item.get('repository', {}).get('name')
            git_owner = item.get('repository', {}).get('owner',
                                                       {}).get('login')
            file_path = item.get('path')
            file_sha = item.get('sha')
            file_name = item.get('name')
            file_html_url = item.get('html_url')

            headers = {}
            if bookmark_query_field:
                headers[bookmark_query_field] = last_modified
            # API request commits_data for single-file, to get file last_modified
            commit_url = '{}/repos/{}/{}/commits?path={}'.format(
                client.base_url, git_owner, git_repository, file_path)
            LOGGER.info('Commit URL for Stream {}: {}'.format(
                stream_name, commit_url))
            commit_data, commits_next_url, commit_last_modified = client.get(
                url=commit_url,
                headers=headers,
                endpoint='{}_commits'.format(stream_name))

            # Bookmarking: search data (and commit data) sorted by last-modified desc
            # 1st item on 1st page sets max_bookmark_value = last-modified
            bookmark_dttm = strptime_to_utc(commit_last_modified)
            if first_record and bookmark_dttm > last_dttm:
                max_bookmark_value = commit_last_modified
                max_bookmark_dttm = bookmark_dttm
                max_bookmark_epoch = int(
                    (max_bookmark_dttm -
                     timezone.localize(datetime(1970, 1, 1))).total_seconds())

                # For some streams (activate_version = True):
                #   Emit a Singer ACTIVATE_VERSION message before the initial sync (version 0)
                #   and again after every completed sync (version = max bookmark epoch).
                # This forces hard deletes on the data downstream if fewer records are sent.
                # https://github.com/singer-io/singer-python/blob/master/singer/messages.py#L137
                if activate_version_ind:
                    if last_datetime == start_date:
                        activate_version = 0
                    else:
                        activate_version = max_bookmark_epoch
                    activate_version_message = singer.ActivateVersionMessage(
                        stream=stream_name, version=activate_version)
                    if last_datetime == start_date:
                        # initial load, send activate_version before AND after data sync
                        singer.write_message(activate_version_message)
                        LOGGER.info(
                            'INITIAL SYNC, Stream: {}, Activate Version: {}'.
                            format(stream_name, activate_version))
                else:
                    activate_version = None
                # End: if first_record and bookmark_dttm > last_dttm

            if commit_data and bookmark_dttm >= last_dttm:
                # API request file_data for item, single-file (ignore file_next_url)
                file_data = {}
                headers = {}
                LOGGER.info('File URL for Stream {}: {}'.format(
                    stream_name, file_url))
                file_data, file_next_url, file_last_modified = client.get(
                    url=file_url, headers=headers, endpoint=stream_name)
                # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT

                if file_data:
                    # Read, decode, and parse content blob to json
                    content = file_data.get('content')
                    content_list = []
                    if content:
                        content_b64 = base64.b64decode(content)
                        # Italian files typically use character_set utf-8 (the default);
                        #   however, some newer files use latin_1, so fall back to the
                        #   configured alt_character_set on a UnicodeDecodeError
                        try:
                            content_str = content_b64.decode('utf-8')
                        except UnicodeDecodeError as err:
                            LOGGER.warning(
                                'UTF-8 UNICODE DECODE ERROR: {}'.format(err))
                            # Try decoding with Alternate Character Set (from streams.py)
                            content_str = content_b64.decode(alt_character_set)
                        content_array = content_str.splitlines()
                        content_array_sliced = content_array[skip_header_rows:]
                        reader = csv.DictReader(content_array_sliced,
                                                delimiter=csv_delimiter)
                        content_list = [r for r in reader]

                    LOGGER.info('Retrieved file_name: {}'.format(file_name))

                    # LOGGER.info('file_data: {}'.format(file_data)) # TESTING ONLY - COMMENT OUT

                    # Loop thru and append csv records
                    row_number = 1
                    for record in content_list:
                        record['git_owner'] = git_owner
                        record['git_repository'] = git_repository
                        record['git_url'] = file_url
                        record['git_html_url'] = file_html_url
                        record['git_path'] = file_path
                        record['git_sha'] = file_sha
                        record['git_file_name'] = file_name
                        record['git_last_modified'] = commit_last_modified
                        record['__sdc_row_number'] = row_number

                        # Transform record and append
                        transformed_csv_record = {}
                        try:
                            transformed_csv_record = transform_record(
                                stream_name, record)
                        except Exception as err:
                            LOGGER.error(
                                'Transform Record error: {}, Stream: {}'.
                                format(err, stream_name))
                            LOGGER.error('record: {}'.format(record))
                            raise err

                        # Skip bad records (transform_record returned None)
                        if transformed_csv_record is None:
                            continue

                        csv_records.append(transformed_csv_record)
                        row_number = row_number + 1
                    # End: for record in content_list loop

                record_count = process_records(catalog=catalog,
                                               stream_name=stream_name,
                                               records=csv_records,
                                               time_extracted=time_extracted,
                                               version=activate_version)
                LOGGER.info('Stream {}, batch processed {} records'.format(
                    stream_name, record_count))
                total_records = total_records + record_count
                # End if commit_data
            first_record = False
            i = i + 1  # Next search item record
            # End: while i <= (item_total - 1) and bookmark_dttm >= last_dttm

        # to_rec: to record; ending record for the batch page
        to_rec = offset + file_count
        LOGGER.info('Synced Stream: {}, page: {}, records: {} to {}'.format(
            stream_name, page, offset, to_rec))
        # Pagination: increment the offset by the file count (batch size) and increment the page
        offset = offset + file_count
        page = page + 1
        # End: next_url is not None and bookmark_dttm >= last_dttm

    if file_count > 0 and max_bookmark_value:
        # End of Stream: Send Activate Version (if needed) and update State
        if activate_version_ind:
            singer.write_message(activate_version_message)
        write_bookmark(state, stream_name, max_bookmark_value)
    else:
        LOGGER.warning('NO NEW DATA FOR STREAM: {}'.format(stream_name))
        write_bookmark(state, stream_name, last_datetime)

    # Return total_records across all pages
    LOGGER.info(
        'Synced Stream: {}, TOTAL pages: {}, file count: {}, total records: {}'
        .format(stream_name, page - 1, file_count, total_records))
    return total_records
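The content-blob handling above (base64 decode, fallback character set, header skipping, csv.DictReader) can be exercised on its own. A small stdlib-only sketch with an inline blob instead of the GitHub API; decode_csv_blob is an illustrative name:

import base64
import csv

def decode_csv_blob(content_b64, alt_character_set='latin_1',
                    skip_header_rows=0, csv_delimiter=','):
    """Decode a base64-encoded CSV blob to a list of dicts, falling back to an alternate charset."""
    raw = base64.b64decode(content_b64)
    try:
        content_str = raw.decode('utf-8')
    except UnicodeDecodeError:
        content_str = raw.decode(alt_character_set)
    lines = content_str.splitlines()[skip_header_rows:]
    return list(csv.DictReader(lines, delimiter=csv_delimiter))

# Usage
blob = base64.b64encode(b'id,name\n1,alpha\n2,beta\n').decode('ascii')
print(decode_csv_blob(blob))  # -> [{'id': '1', 'name': 'alpha'}, {'id': '2', 'name': 'beta'}]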
Esempio n. 23
0
def sync_stream(stream_name):
    """
    Sync each stream, looking for newly created records. Updates are captured by the events stream.
    """
    LOGGER.info("Started syncing stream %s", stream_name)

    stream_metadata = metadata.to_map(
        Context.get_catalog_entry(stream_name)['metadata'])
    stream_field_whitelist = json.loads(
        Context.config.get('whitelist_map', '{}')).get(stream_name)

    extraction_time = singer.utils.now()
    replication_key = metadata.get(stream_metadata, (),
                                   'valid-replication-keys')[0]
    # Invoice Items bookmarks on `date`, but queries on `created`
    filter_key = 'created' if stream_name == 'invoice_items' else replication_key
    stream_bookmark = singer.get_bookmark(Context.state, stream_name, replication_key) or \
        int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())
    bookmark = stream_bookmark

    # if this stream has a sub_stream, compare the bookmark
    sub_stream_name = SUB_STREAMS.get(stream_name)

    # If there is a sub-stream and it's selected, get its bookmark (or the start date if no bookmark)
    should_sync_sub_stream = sub_stream_name and Context.is_selected(
        sub_stream_name)
    if should_sync_sub_stream:
        sub_stream_bookmark = singer.get_bookmark(Context.state, sub_stream_name, replication_key) \
            or int(utils.strptime_to_utc(Context.config["start_date"]).timestamp())

        # if there is a sub stream, set bookmark to sub stream's bookmark
        # since we know it must be earlier than the stream's bookmark
        if sub_stream_bookmark != stream_bookmark:
            bookmark = sub_stream_bookmark
    else:
        sub_stream_bookmark = None

    with Transformer(
            singer.UNIX_SECONDS_INTEGER_DATETIME_PARSING) as transformer:
        end_time = dt_to_epoch(utils.now())
        window_size = int(
            Context.config.get('date_window_size', DEFAULT_DATE_WINDOW_SIZE))
        if DEFAULT_DATE_WINDOW_SIZE != window_size:
            LOGGER.info('Using non-default date window size of %d',
                        window_size)
        start_window = bookmark

        # NB: Immutable streams are never synced for updates. We've
        # observed a short lag period between when records are created and
        # when they are available via the API, so these streams will need
        # a short lookback window.
        if stream_name in IMMUTABLE_STREAMS:
            # pylint:disable=fixme
            # TODO: This may be an issue for other streams' created_at
            # entries, but to keep the surface small, doing this only for
            # immutable streams at first to confirm the suspicion.
            start_window -= IMMUTABLE_STREAM_LOOKBACK

        # NB: We observed records coming through newest->oldest and so
        # date-windowing was added and the tap only bookmarks after it has
        # gotten through a date window
        while start_window < end_time:
            stop_window = dt_to_epoch(
                epoch_to_dt(start_window) + timedelta(days=window_size))
            # cut off the last window at the end time
            if stop_window > end_time:
                stop_window = end_time

            for stream_obj in paginate(
                    STREAM_SDK_OBJECTS[stream_name]['sdk_object'], filter_key,
                    start_window, stop_window):

                # get the replication key value from the object
                rec = unwrap_data_objects(stream_obj.to_dict_recursive())
                rec = reduce_foreign_keys(rec, stream_name)
                stream_obj_created = rec[replication_key]
                rec['updated'] = stream_obj_created

                # sync stream if object is greater than or equal to the bookmark
                if stream_obj_created >= stream_bookmark:
                    rec = transformer.transform(
                        rec,
                        Context.get_catalog_entry(stream_name)['schema'],
                        stream_metadata)

                    # At this point, the record has been transformed and so
                    # any de-selected fields have been pruned. Now, prune off
                    # any fields that aren't present in the whitelist.
                    if stream_field_whitelist:
                        rec = apply_whitelist(rec, stream_field_whitelist)

                    singer.write_record(stream_name,
                                        rec,
                                        time_extracted=extraction_time)

                    Context.new_counts[stream_name] += 1

                # sync sub streams if it's selected and the parent object
                # is greater than its bookmark
                if should_sync_sub_stream and stream_obj_created > sub_stream_bookmark:
                    sync_sub_stream(sub_stream_name, stream_obj)

            # Update stream/sub-streams bookmarks as stop window
            if stop_window > stream_bookmark:
                stream_bookmark = stop_window
                singer.write_bookmark(Context.state, stream_name,
                                      replication_key, stream_bookmark)

            # the sub stream bookmarks on its parent
            if should_sync_sub_stream and stop_window > sub_stream_bookmark:
                sub_stream_bookmark = stop_window
                singer.write_bookmark(Context.state, sub_stream_name,
                                      replication_key, sub_stream_bookmark)

            singer.write_state(Context.state)

            # update window for next iteration
            start_window = stop_window

    singer.write_state(Context.state)
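The date windowing above relies on dt_to_epoch/epoch_to_dt helpers defined elsewhere in that tap. A stdlib-only approximation of the same loop structure, with illustrative names and an assumed 30-day window:

from datetime import datetime, timedelta, timezone

def dt_to_epoch(dt):
    return int(dt.timestamp())

def epoch_to_dt(epoch):
    return datetime.fromtimestamp(epoch, tz=timezone.utc)

def date_windows(start_epoch, end_epoch, window_days=30):
    """Yield (start, stop) epoch pairs, clamping the final window at end_epoch."""
    start = start_epoch
    while start < end_epoch:
        stop = dt_to_epoch(epoch_to_dt(start) + timedelta(days=window_days))
        stop = min(stop, end_epoch)
        yield start, stop
        start = stop

# Usage: window the last ~90 days in 30-day slices
now_epoch = dt_to_epoch(datetime.now(timezone.utc))
for lo, hi in date_windows(now_epoch - 90 * 86400, now_epoch):
    print(epoch_to_dt(lo).date(), '->', epoch_to_dt(hi).date())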
Esempio n. 24
0
def sync_transactions():
    schema = load_schema("transactions")

    singer.write_schema("transactions",
                        schema, ["id"],
                        bookmark_properties=['created_at'])

    latest_updated_at = utils.strptime_to_utc(
        STATE.get('latest_updated_at', DEFAULT_TIMESTAMP))

    run_maximum_updated_at = latest_updated_at

    latest_disbursement_date = utils.strptime_to_utc(
        STATE.get('latest_disbursement_date', DEFAULT_TIMESTAMP))

    run_maximum_disbursement_date = latest_disbursement_date

    latest_start_date = utils.strptime_to_utc(get_start("transactions"))

    period_start = latest_start_date - TRAILING_DAYS

    period_end = utils.now()

    logger.info("transactions: Syncing from {}".format(period_start))

    logger.info(
        "transactions: latest_updated_at from {}, disbursement_date from {}".
        format(latest_updated_at, latest_disbursement_date))

    logger.info(
        "transactions: latest_start_date from {}".format(latest_start_date))

    # increment through each day (20k results max from api)
    for start, end in daterange(period_start, period_end):

        end = min(end, period_end)

        data = braintree.Transaction.search(
            braintree.TransactionSearch.created_at.between(start, end))
        time_extracted = utils.now()

        logger.info("transactions: Fetched {} records from {} - {}".format(
            data.maximum_size, start, end))

        row_written_count = 0
        row_skipped_count = 0

        for row in data:
            # Ensure updated_at consistency
            if not getattr(row, 'updated_at'):
                row.updated_at = row.created_at

            transformed = transform_row(row, schema)
            updated_at = to_utc(row.updated_at)

            # if disbursement is successful, get disbursement date
            # set disbursement datetime to min if not found

            if row.disbursement_details is None:
                disbursement_date = datetime.min

            else:
                if row.disbursement_details.disbursement_date is None:
                    row.disbursement_details.disbursement_date = datetime.min

                disbursement_date = to_utc(
                    datetime.combine(
                        row.disbursement_details.disbursement_date,
                        datetime.min.time()))

            # Is this more recent than our past stored value of updated_at?
            # Is this more recent than our past stored value of disbursement_date?
            # Use >= for updated_at due to non-monotonic updated_at values
            # Use > for disbursement_date - confirming all transactions disbursed
            # at the same time
            # Update our high water mark for updated_at and disbursement_date
            # in this run
            if (updated_at >= latest_updated_at) or (disbursement_date >=
                                                     latest_disbursement_date):

                if updated_at > run_maximum_updated_at:
                    run_maximum_updated_at = updated_at

                if disbursement_date > run_maximum_disbursement_date:
                    run_maximum_disbursement_date = disbursement_date

                singer.write_record("transactions",
                                    transformed,
                                    time_extracted=time_extracted)
                row_written_count += 1

            else:

                row_skipped_count += 1

        logger.info("transactions: Written {} records from {} - {}".format(
            row_written_count, start, end))

        logger.info("transactions: Skipped {} records from {} - {}".format(
            row_skipped_count, start, end))

    # End day loop
    logger.info("transactions: Complete. Last updated record: {}".format(
        run_maximum_updated_at))

    logger.info("transactions: Complete. Last disbursement date: {}".format(
        run_maximum_disbursement_date))

    latest_updated_at = run_maximum_updated_at

    latest_disbursement_date = run_maximum_disbursement_date

    STATE['latest_updated_at'] = utils.strftime(latest_updated_at)

    STATE['latest_disbursement_date'] = utils.strftime(
        latest_disbursement_date)

    utils.update_state(STATE, "transactions", utils.strftime(end))

    singer.write_state(STATE)
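sync_transactions iterates over daterange(period_start, period_end), a helper not shown here. A plausible stdlib sketch of a day-sized window generator; its exact shape in the tap may differ:

from datetime import datetime, timedelta, timezone

def daterange(period_start, period_end):
    """Yield (start, end) pairs covering one day at a time, in order."""
    current = period_start
    while current < period_end:
        yield current, current + timedelta(days=1)
        current += timedelta(days=1)

# Usage
start = datetime(2021, 6, 1, tzinfo=timezone.utc)
for s, e in daterange(start, start + timedelta(days=3)):
    print(s.date(), '->', e.date())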
Esempio n. 25
0
def sync(client, config, catalog, state):
    LOGGER.info('Starting Sync..')
    selected_streams = catalog.get_selected_streams(state)

    streams = []
    stream_keys = []
    with Transformer() as transformer:
        for catalog_entry in selected_streams:
            streams.append(catalog_entry)
            stream_keys.append(catalog_entry.stream)

        for catalog_entry in streams:
            stream = AVAILABLE_STREAMS[catalog_entry.stream](client=client,
                                                             config=config,
                                                             catalog=catalog,
                                                             state=state)
            LOGGER.info('Syncing stream: %s', catalog_entry.stream)

            stream.update_currently_syncing(stream.name)
            stream.write_state()
            stream_schema = catalog_entry.schema.to_dict()
            stream.write_schema()
            stream_metadata = metadata.to_map(catalog_entry.metadata)

            bookmark_date = stream.get_bookmark(stream.name,
                                                config['start_date'])
            bookmark_dttm = strptime_to_utc(bookmark_date)
            max_bookmark_value = None

            with singer.metrics.record_counter(
                    endpoint=stream.name) as counter:
                if stream.replication_method == 'FULL_TABLE':
                    for page in stream.sync(client):
                        for record in page:
                            singer.write_record(
                                catalog_entry.stream,
                                transformer.transform(
                                    record,
                                    stream_schema,
                                    stream_metadata,
                                ))
                            counter.increment()
                else:
                    for page in stream.sync(client, bookmark_date):
                        for record in page:
                            if not max_bookmark_value:
                                max_bookmark_value = bookmark_date
                            max_bookmark_dttm = strptime_to_utc(
                                max_bookmark_value)

                            record_timestamp = stream.max_from_replication_dates(
                                record)
                            if record_timestamp > max_bookmark_dttm:
                                max_bookmark_value = strftime(record_timestamp)

                            if record_timestamp >= bookmark_dttm:
                                singer.write_record(
                                    catalog_entry.stream,
                                    transformer.transform(
                                        record,
                                        stream_schema,
                                        stream_metadata,
                                    ))
                                counter.increment()
                        stream.update_bookmark(stream.name, max_bookmark_value)
                        stream.write_state()
            stream.update_currently_syncing(None)
        stream.write_state()
        LOGGER.info('Finished Sync..')
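The incremental branch above calls stream.max_from_replication_dates, which lives on the stream classes and is not shown. A hypothetical stdlib sketch of what such a helper might do; the field names are assumptions:

from datetime import datetime

def max_from_replication_dates(record, replication_keys=('updated_at', 'created_at')):
    """Return the latest datetime found among a record's replication-key fields."""
    values = [
        datetime.fromisoformat(record[key].replace('Z', '+00:00'))
        for key in replication_keys
        if record.get(key)
    ]
    return max(values)

# Usage
rec = {'created_at': '2021-06-01T00:00:00Z', 'updated_at': '2021-06-03T12:00:00Z'}
print(max_from_replication_dates(rec))  # -> 2021-06-03 12:00:00+00:00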
Esempio n. 26
0
    def subtract_day(self, bookmark):
        bookmark_dt = strptime_to_utc(bookmark)
        adjusted_bookmark = bookmark_dt - timedelta(days=1)
        return strftime(adjusted_bookmark)
Esempio n. 27
0
def process_args():
    # Parse command line arguments
    args = utils.parse_args(REQUIRED_CONFIG_KEYS)

    # Check for errors on the provided config
    # params that utils.parse_args is letting through
    if not args.config.get('start_date'):
        LOGGER.critical(
            "tap-google-analytics: a valid start_date must be provided.")
        sys.exit(1)

    if not (args.config.get('reports') or args.catalog):
        LOGGER.critical(
            "tap-google-analytics: a catalog or report must be provided.")
        sys.exit(1)

    if not args.config.get('key_file_location') and \
       not args.config.get('oauth_credentials'):
        LOGGER.critical(
            "tap-google-analytics: a valid key_file_location string or \
oauth_credentials object must be provided.")
        sys.exit(1)

    # Remove optional args that have empty strings as values
    if 'reports' in args.config and not args.config.get('reports'):
        del args.config['reports']

    if 'end_date' in args.config and not args.config.get('end_date'):
        del args.config['end_date']

    # Process start_date and end_date so that they define a half-open date
    # window [start_date, end_date) that ends yesterday when end_date is not provided
    start_date = utils.strptime_to_utc(args.config['start_date'])
    args.config['start_date'] = utils.strftime(start_date, '%Y-%m-%d')

    end_date = args.config.get('end_date', utils.strftime(utils.now()))
    end_date = utils.strptime_to_utc(end_date) - datetime.timedelta(days=1)
    args.config['end_date'] = utils.strftime(end_date, '%Y-%m-%d')

    if end_date < start_date:
        LOGGER.critical(
            "tap-google-analytics: start_date '{}' > end_date '{}'".format(
                start_date, end_date))
        sys.exit(1)

    # If using a service account, validate that the client_secrets.json file
    # exists and load it
    if args.config.get('key_file_location'):
        if Path(args.config['key_file_location']).is_file():
            try:
                args.config['client_secrets'] = load_json(
                    args.config['key_file_location'])
            except ValueError:
                LOGGER.critical(
                    "tap-google-analytics: The JSON definition in '{}' has \
errors".format(args.config['key_file_location']))
                sys.exit(1)
        else:
            LOGGER.critical("tap-google-analytics: '{}' file not found".format(
                args.config['key_file_location']))
            sys.exit(1)
    else:
        # If using oauth credentials, verify that all required keys are present
        credentials = args.config['oauth_credentials']
        for key in [
                'access_token', 'refresh_token', 'client_id', 'client_secret'
        ]:
            if not credentials.get(key):
                LOGGER.critical(f"tap-google-analytics: a valid {key} for the \
oauth_credentials must be provided.")
                sys.exit(1)
    return args
Esempio n. 28
0
    def test_run(self):
        """
        Verify that we can get multiple pages of data for each stream
        """
        conn_id = connections.ensure_connection(self)
        self.run_and_verify_check_mode(conn_id)

        self.select_and_verify_fields(conn_id)

        first_sync_record_count = self.run_and_verify_sync(conn_id)

        first_sync_bookmarks = menagerie.get_state(conn_id)
        first_sync_records = runner.get_records_from_target_output()

        new_bookmarks = {}
        for stream_name, current_bookmark in first_sync_bookmarks[
                'bookmarks'].items():
            if stream_name == 'gl_accounts':
                new_gl_bookmarks = {
                    sub_stream: self.subtract_day(sub_bookmark)
                    for sub_stream, sub_bookmark in current_bookmark.items()
                }
                new_bookmarks[stream_name] = new_gl_bookmarks
            else:
                new_bookmarks[stream_name] = self.subtract_day(
                    current_bookmark)

        new_state = {"bookmarks": new_bookmarks}

        # Ensure the test is not the first to post a state
        poll_state_version(conn_id)

        menagerie.set_state(conn_id, new_state)

        # Run a sync job using orchestrator
        second_sync_record_count = self.run_and_verify_sync(conn_id)
        second_sync_bookmarks = menagerie.get_state(conn_id)
        second_sync_records = runner.get_records_from_target_output()

        for stream in self.expected_sync_streams():
            with self.subTest(stream=stream):
                replication_method = self.expected_replication_method().get(
                    stream)

                first_sync_count = first_sync_record_count.get(stream, 0)
                second_sync_count = second_sync_record_count.get(stream, 0)

                first_sync_messages = first_sync_records.get(stream, {}).get(
                    'messages', [])
                second_sync_messages = second_sync_records.get(stream, {}).get(
                    'messages', [])

                if replication_method == self.INCREMENTAL:
                    replication_key = self.expected_replication_keys().get(
                        stream).pop()

                    first_sync_bookmark_value = first_sync_bookmarks[
                        'bookmarks'][stream]
                    second_sync_bookmark_value = second_sync_bookmarks[
                        'bookmarks'][stream]
                    simulated_bookmark_value = new_state['bookmarks'][stream]

                    # Verify that both syncs end on the same bookmark
                    self.assertEqual(first_sync_bookmark_value,
                                     second_sync_bookmark_value)

                    # Verify that first sync records fall between the start date and the final
                    # bookmark value
                    for message in first_sync_messages:
                        lower_bound = strptime_to_utc(
                            self.get_properties()['start_date'])
                        actual_value = strptime_to_utc(
                            message.get('data').get(replication_key))
                        upper_bound = strptime_to_utc(
                            first_sync_bookmark_value)
                        self.assertTrue(
                            lower_bound <= actual_value <= upper_bound,
                            msg=
                            "First sync records fall outside of expected sync window"
                        )

                    # Verify the second sync records fall between simulated bookmark value and the
                    # final bookmark value
                    for message in second_sync_messages:
                        lower_bound = strptime_to_utc(simulated_bookmark_value)
                        actual_value = strptime_to_utc(
                            message.get('data', {}).get(replication_key))
                        upper_bound = strptime_to_utc(
                            second_sync_bookmark_value)
                        self.assertTrue(
                            lower_bound <= actual_value <= upper_bound,
                            msg=
                            "Second sync records fall outside of expected sync window"
                        )

                    # Verify the number of records in the 2nd sync is less than in the first
                    self.assertLess(second_sync_count, first_sync_count)

                    # Verify at least 1 record was replicated in the second sync
                    self.assertGreater(
                        second_sync_count,
                        0,
                        msg="We are not fully testing bookmarking for {}".
                        format(stream))

                elif replication_method == self.FULL_TABLE:
                    # Verify no bookmark exists
                    self.assertNotIn(stream, first_sync_bookmarks['bookmarks'])
                    self.assertNotIn(stream,
                                     second_sync_bookmarks['bookmarks'])

                else:
                    raise NotImplementedError(
                        "invalid replication method: {}".format(
                            replication_method))
Esempio n. 29
0
def sync_engagements(STATE, ctx):
    catalog = ctx.get_catalog_from_id(singer.get_currently_syncing(STATE))
    mdata = metadata.to_map(catalog.get('metadata'))

    if "schema" in catalog:
        schema = catalog["schema"]
    else:
        schema = load_schema('engagements')

    bookmark_key = 'lastUpdated'
    singer.write_schema("engagements", schema, ["engagement_id"],
                        [bookmark_key], catalog.get('stream_alias'))
    start = get_start(STATE, "engagements", bookmark_key)

    # Because this stream doesn't query by `lastUpdated`, it cycles
    # through the data set every time. The issue with this is that there
    # is a race condition by which records may be updated between the
    # start of this table's sync and the end, causing some updates to not
    # be captured. To combat this, we store the current sync's start in
    # the state and never move the bookmark past this value.
    current_sync_start = get_current_sync_start(STATE,
                                                "engagements") or utils.now()
    STATE = write_current_sync_start(STATE, "engagements", current_sync_start)
    singer.write_state(STATE)

    max_bk_value = start
    LOGGER.info("sync_engagements from %s", start)

    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key, start)
    singer.write_state(STATE)

    url = get_url("engagements_all")
    params = {'limit': 250}
    top_level_key = "results"
    engagements = gen_request(STATE, 'engagements', url, params, top_level_key,
                              "hasMore", ["offset"], ["offset"])

    time_extracted = utils.now()

    with Transformer(UNIX_MILLISECONDS_INTEGER_DATETIME_PARSING) as bumble_bee:
        for engagement in engagements:
            record = bumble_bee.transform(
                lift_properties_and_versions(engagement), schema, mdata)
            if record['engagement'][bookmark_key] >= start:
                # hoist PK and bookmark field to top-level record
                record['engagement_id'] = record['engagement']['id']
                record[bookmark_key] = record['engagement'][bookmark_key]
                singer.write_record("engagements",
                                    record,
                                    catalog.get('stream_alias'),
                                    time_extracted=time_extracted)
                if record['engagement'][bookmark_key] >= max_bk_value:
                    max_bk_value = record['engagement'][bookmark_key]

    # Don't bookmark past the start of this sync to account for updated records during the sync.
    new_bookmark = min(utils.strptime_to_utc(max_bk_value), current_sync_start)
    STATE = singer.write_bookmark(STATE, 'engagements', bookmark_key,
                                  utils.strftime(new_bookmark))
    STATE = write_current_sync_start(STATE, 'engagements', None)
    singer.write_state(STATE)
    return STATE
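The final bookmark write above caps the new bookmark at the moment this sync started, so records updated mid-sync are re-fetched on the next run. The core of that pattern as a stdlib-only sketch (clamp_bookmark is an illustrative name):

from datetime import datetime, timezone

def clamp_bookmark(max_bookmark_value, current_sync_start):
    """Never bookmark past the start of the current sync."""
    return min(max_bookmark_value, current_sync_start)

# Usage
sync_start = datetime(2021, 6, 1, 12, 0, tzinfo=timezone.utc)
max_seen = datetime(2021, 6, 1, 12, 30, tzinfo=timezone.utc)  # record updated mid-sync
print(clamp_bookmark(max_seen, sync_start))  # -> 2021-06-01 12:00:00+00:00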
Esempio n. 30
0
def sync_endpoint(client,
                  config,
                  catalog,
                  state,
                  stream_name,
                  endpoint_config,
                  sync_streams,
                  selected_streams,
                  timezone_desc=None,
                  parent_id=None):

    # endpoint_config variables
    base_path = endpoint_config.get('path', stream_name)
    bookmark_field = next(iter(endpoint_config.get('replication_keys', [])),
                          None)
    params = endpoint_config.get('params', {})
    paging = endpoint_config.get('paging', False)
    bookmark_query_field_from = endpoint_config.get(
        'bookmark_query_field_from')
    bookmark_query_field_to = endpoint_config.get('bookmark_query_field_to')
    targeting_group = endpoint_config.get('targeting_group')
    targeting_type = endpoint_config.get('targeting_type')
    targeting_country_ind = endpoint_config.get('targeting_country_ind', False)
    data_key_array = endpoint_config.get('data_key_array')
    data_key_record = endpoint_config.get('data_key_record').format(
        targeting_type=targeting_type)
    id_fields = endpoint_config.get('key_properties')
    parent = endpoint_config.get('parent')
    date_window_size = int(endpoint_config.get('date_window_size', '1'))

    # tap config variables
    start_date = config.get('start_date')
    swipe_up_attribution_window = config.get('swipe_up_attribution_window',
                                             '28_DAY')
    view_attribution_window = config.get('view_attribution_window', '7_DAY')

    swipe_up_attr = int(swipe_up_attribution_window.replace('_DAY', ''))

    if view_attribution_window in (
            '1_HOUR',
            '3_HOUR',
            '6_HOUR',
    ):
        view_attr = 1
    else:
        view_attr = int(view_attribution_window.replace('_DAY', ''))

    attribution_window = max(1, swipe_up_attr, view_attr)

    omit_empty = config.get('omit_empty', 'true')
    if '_stats_' in stream_name:
        params['omit_empty'] = omit_empty

    country_codes = config.get('targeting_country_codes',
                               'us').replace(' ', '').lower()
    if targeting_country_ind:
        country_code_list = country_codes.split(',')
    else:
        country_code_list = ['none']

    # Get the timezone and latest bookmark for the stream
    if not timezone_desc:
        timezone = pytz.timezone('UTC')
    else:
        timezone = pytz.timezone(timezone_desc)
    LOGGER.info('timezone = {}'.format(timezone))

    last_datetime = get_bookmark(state, stream_name, start_date,
                                 bookmark_field, parent, parent_id)
    max_bookmark_value = last_datetime

    # Convert to datetimes in local/ad account timezone
    now_datetime = utils.now()
    last_dttm = strptime_to_utc(last_datetime)

    report_granularity = params.get('granularity', 'HOUR')
    if '_stats_' in stream_name:
        LOGGER.info('report_granularity: {}'.format(report_granularity))

    if bookmark_query_field_from and bookmark_query_field_to:
        # date_window_size: Number of days in each date window
        # Set start window
        start_window = now_datetime - timedelta(days=attribution_window)
        if last_dttm < start_window:
            start_window = last_dttm
        # Set end window
        end_window = start_window + timedelta(days=date_window_size)

    else:
        start_window = last_dttm
        end_window = now_datetime
        diff_sec = (end_window - start_window).total_seconds()
        date_window_size = math.ceil(
            diff_sec / (3600 * 24))  # round-up difference to days

    endpoint_total = 0
    total_records = 0

    while start_window < now_datetime:
        LOGGER.info('START Sync for Stream: {}{}'.format(
            stream_name,
            ', Date window from: {} to {}'.format(start_window.date(), end_window.date()) \
                if bookmark_query_field_from else ''))

        if bookmark_query_field_from and bookmark_query_field_to:
            # Query parameter startDate and endDate must be in Eastern time zone
            # API will error if future dates are requested
            if report_granularity == 'DAY':
                window_start_dt_str = remove_hours_local(
                    start_window, timezone)
                window_end_dt_str = remove_hours_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(
                        end_window + timedelta(days=1), timezone)
            else:
                window_start_dt_str = remove_minutes_local(
                    start_window, timezone)
                window_end_dt_str = remove_minutes_local(end_window, timezone)
                if window_start_dt_str == window_end_dt_str:
                    window_end_dt_str = remove_hours_local(
                        end_window + timedelta(hours=1), timezone)

            params[bookmark_query_field_from] = window_start_dt_str
            params[bookmark_query_field_to] = window_end_dt_str

        # This loop will run once for non-country_code endpoints
        #   and one or more times (for each country) for country_code endpoints
        for country_code in country_code_list:
            # Path
            if stream_name.startswith('targeting_'):
                path = base_path.format(targeting_group=targeting_group,
                                        targeting_type=targeting_type,
                                        country_code=country_code,
                                        parent_id=parent_id)
            else:
                path = base_path.format(country_code=country_code,
                                        parent_id=parent_id)

            # pagination: loop thru all pages of data using next (if not None)
            #   Reference: https://developers.snapchat.com/api/docs/#pagination
            total_records = 0
            offset = 1
            page = 1
            if paging:
                limit = 500  # Allowed values: 50 - 1000
                params['limit'] = limit
            else:
                limit = None

            for key, val in params.items():
                # Replace variables in params
                new_val = str(val).format(
                    swipe_up_attribution_window=swipe_up_attribution_window,
                    view_attribution_window=view_attribution_window)
                params[key] = new_val
            # concatenate params into a querystring
            querystring = '&'.join(
                ['%s=%s' % (key, value) for (key, value) in params.items()])

            # initialize next_url
            next_url = '{}/{}?{}'.format(client.base_url, path, querystring)

            # pagination loop
            while next_url is not None:

                # API request data
                data = {}
                try:
                    data = client.get(url=next_url, endpoint=stream_name)
                except Exception as err:
                    LOGGER.error('{}'.format(err))
                    LOGGER.error('URL for Stream {}: {}'.format(
                        stream_name, next_url))
                    raise Exception(err)

                # time_extracted: datetime when the data was extracted from the API
                time_extracted = utils.now()
                if not data or data is None or data == {}:
                    LOGGER.info('No data results returned')
                    total_records = 0
                    break  # No data results

                request_status = data.get('request_status')
                if request_status != 'SUCCESS':
                    raise RuntimeError(data)

                # Get pagination next_url
                next_url = data.get('paging', {}).get('next_link', None)

                # Transform data with transform_json from transform.py
                # The data_key_array identifies the array/list of records below the <root> element
                # LOGGER.info('data = {}'.format(data)) # TESTING, comment out
                transformed_data = []  # initialize the record list

                # Reports stats streams de-nesting
                if '_stats_' in stream_name:
                    for data_record in data.get(data_key_array, []):
                        base_record = data_record.get(data_key_record, {})
                        records = base_record.get('timeseries', [])
                        for record in records:
                            # Add parent base_record fields to record
                            for key, val in base_record.items():
                                if key not in ('start_time', 'end_time',
                                               'timeseries'):
                                    record[key] = val

                            # De-nest stats
                            stats = record.get('stats', {})
                            for key, val in stats.items():
                                record[key] = val
                            record.pop('stats', None)

                            # transform record
                            try:
                                transformed_record = humps.decamelize(record)
                            except Exception as err:
                                LOGGER.error('{}'.format(err))
                                # LOGGER.error('error record: {}'.format(record)) # COMMENT OUT
                                raise Exception(err)

                            # verify primary_keys are in transformed_record
                            if 'id' not in transformed_record or 'start_time' not in transformed_record:
                                LOGGER.error(
                                    'Stream: {}, Missing key (id or start_time)'
                                    .format(stream_name))
                                LOGGER.error('transformed_record: {}'.format(
                                    transformed_record))
                                raise RuntimeError(
                                    'Missing primary key (id or start_time) in transformed_record')

                            transformed_data.append(transformed_record)
                            # End for record in records
                        # End for data_record in array
                    # End stats stream

                # Other streams de-nesting
                else:  # Not stats stream
                    for data_record in data.get(data_key_array, []):
                        sub_request_status = data_record.get(
                            'sub_request_status')
                        if sub_request_status != 'SUCCESS':
                            raise RuntimeError(data_record)

                        record = data_record.get(data_key_record, {})

                        # Transforms to align schemas for targeting streams
                        if stream_name.startswith('targeting_'):
                            record['targeting_group'] = targeting_group
                            record['targeting_type'] = targeting_type
                            if country_code != 'none':
                                record['country_code'] = country_code
                            if targeting_group == 'geo':
                                record_id = record.get(targeting_type,
                                                       {}).get('id')
                                record_name = record.get(targeting_type,
                                                         {}).get('name')
                                record['id'] = record_id
                                record['name'] = record_name
                            if targeting_type == 'postal_code':
                                record_id = record.get('postalCode')
                                record['id'] = record_id
                                record['name'] = record_id
                                record.pop('postalCode', None)
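                            # The geo/postal_code transforms above give targeting records a
                            #   consistent top-level id/name shape across targeting types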

                        # Add parent id field/value
                        if parent and parent_id:
                            parent_key = '{}_id'.format(parent)
                            record[parent_key] = parent_id

                        # transform record (remove inconsistent use of CamelCase)
                        try:
                            transformed_record = humps.decamelize(record)
                        except Exception as err:
                            LOGGER.error('{}'.format(err))
                            LOGGER.error('error record: {}'.format(record))
                            raise  # re-raise to preserve the original exception type and traceback

                        # verify primary_keys are in transformed_record
                        for key in id_fields:
                            if not transformed_record.get(key):
                                LOGGER.error(
                                    'Stream: {}, Missing key {}'.format(
                                        stream_name, key))
                                LOGGER.info('transformed_record: {}'.format(
                                    transformed_record))
                                raise RuntimeError(
                                    'Missing primary key {} in transformed_record'.format(key))

                        transformed_data.append(transformed_record)
                        # End for data_record in array
                    # End non-stats stream

                # LOGGER.info('transformed_data = {}'.format(transformed_data)) # COMMENT OUT
                if not transformed_data:
                    LOGGER.info(
                        'No transformed data for data = {}'.format(data))
                    total_records = 0
                    break  # No transformed_data results

                # Process records and get the max_bookmark_value and record_count
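                # process_records (defined elsewhere in this tap) is expected to emit Singer
                #   RECORD messages and return the max bookmark value seen plus the count written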
                if stream_name in sync_streams:
                    max_bookmark_value, record_count = process_records(
                        catalog=catalog,
                        stream_name=stream_name,
                        records=transformed_data,
                        time_extracted=time_extracted,
                        bookmark_field=bookmark_field,
                        max_bookmark_value=max_bookmark_value,
                        last_datetime=last_datetime)
                    LOGGER.info('Stream {}, batch processed {} records'.format(
                        stream_name, record_count))

                # Loop through parent batch records and sync each child object (if the child stream should be synced)
                children = endpoint_config.get('children')
                if children:
                    for child_stream_name, child_endpoint_config in children.items():
                        if child_stream_name in sync_streams:
                            LOGGER.info(
                                'START Syncing: {}'.format(child_stream_name))
                            write_schema(catalog, child_stream_name)
                            # For each parent record
                            for record in transformed_data:
                                # Set parent_id_field: default to the first id_field,
                                #   preferring 'id' when present
                                parent_id_field = id_fields[0] if id_fields else None
                                if 'id' in id_fields:
                                    parent_id_field = 'id'
                                parent_id = record.get(parent_id_field)

                                if stream_name == 'ad_accounts':
                                    timezone_desc = record.get(
                                        'timezone', timezone_desc)

                                # sync_endpoint for child
                                LOGGER.info(
                                    'START Sync for Stream: {}, parent_stream: {}, parent_id: {}'.format(
                                        child_stream_name, stream_name, parent_id))

                                child_total_records = sync_endpoint(
                                    client=client,
                                    config=config,
                                    catalog=catalog,
                                    state=state,
                                    stream_name=child_stream_name,
                                    endpoint_config=child_endpoint_config,
                                    sync_streams=sync_streams,
                                    selected_streams=selected_streams,
                                    timezone_desc=timezone_desc,
                                    parent_id=parent_id)

                                LOGGER.info(
                                    'FINISHED Sync for Stream: {}, parent_id: {}, total_records: {}'.format(
                                        child_stream_name, parent_id, child_total_records))
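                                # sync_endpoint recurses here: each selected child stream is
                                #   synced once per parent record, keyed by the parent id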
                                # End transformed data record loop
                            # End if child in sync_streams
                        # End child streams for parent
                    # End if children

                # Parent record batch
                total_records = total_records + record_count
                endpoint_total = endpoint_total + record_count

                LOGGER.info(
                    'Synced Stream: {}, page: {}, records: {} to {}'.format(
                        stream_name, page, offset, total_records))
                # Pagination: increment the offset by the limit (batch-size) and page
                if limit:
                    offset = offset + limit
                page = page + 1
                # End page/batch - while next URL loop
            # End country_code loop

        # Update the state with the max_bookmark_value for the stream date window
        # Snapchat Ads API does not allow page/batch sorting; bookmark written for date window
        if bookmark_field and stream_name in selected_streams:
            write_bookmark(state, stream_name, max_bookmark_value,
                           bookmark_field, parent, parent_id)

        # Advance to the next date window
        start_window = end_window
        next_end_window = end_window + timedelta(days=date_window_size)
        if next_end_window > now_datetime:
            end_window = now_datetime
        else:
            end_window = next_end_window
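        # end_window is clamped to now_datetime so the final window never extends into the future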
        # End date window

    # Return endpoint_total (total records across all pages and date windows)
    return endpoint_total