Example #1
    def sync(self, state):
        bookmark = self.get_bookmark(state)
        original_search_window_size = int(
            self.config.get('search_window_size', DEFAULT_SEARCH_WINDOW_SIZE))
        search_window_size = original_search_window_size
        # We subtract a second here because the API seems to compare
        # start_time with a strict >, but we typically prefer >= behavior.
        # Also, the start_time query parameter filters based on
        # created_at, but Zendesk support confirmed with us that
        # satisfaction_ratings are immutable, so created_at ==
        # updated_at.
        start = bookmark - datetime.timedelta(seconds=1)
        end = start + datetime.timedelta(seconds=search_window_size)
        sync_end = singer.utils.now() - datetime.timedelta(minutes=1)
        # Use timestamp() for epoch seconds; strftime('%s') is a
        # platform-specific extension and is not portable.
        epoch_sync_end = int(sync_end.timestamp())
        parsed_sync_end = singer.strftime(sync_end, "%Y-%m-%dT%H:%M:%SZ")

        while start < sync_end:
            epoch_start = int(start.timestamp())
            parsed_start = singer.strftime(start, "%Y-%m-%dT%H:%M:%SZ")
            epoch_end = int(end.timestamp())
            parsed_end = singer.strftime(end, "%Y-%m-%dT%H:%M:%SZ")

            LOGGER.info("Querying for satisfaction ratings between %s and %s",
                        parsed_start, min(parsed_end, parsed_sync_end))
            satisfaction_ratings = self.client.satisfaction_ratings(
                start_time=epoch_start,
                end_time=min(epoch_end, epoch_sync_end))
            # NB: We've observed that the tap can sync 50k records in ~15
            # minutes, due to this, the tap will adjust the time range
            # dynamically to ensure bookmarks are able to be written in
            # cases of high volume.
            if satisfaction_ratings.count > 50000:
                search_window_size = search_window_size // 2
                end = start + datetime.timedelta(seconds=search_window_size)
                LOGGER.info(
                    "satisfaction_ratings - Detected Search API response size for this window is too large (> 50k). Cutting search window in half to %s seconds.",
                    search_window_size)
                continue
            for satisfaction_rating in satisfaction_ratings:
                assert parsed_start <= satisfaction_rating.updated_at, "satisfaction_ratings - Record found before date window start. Details: window start ({}) is not less than or equal to updated_at ({})".format(
                    parsed_start, satisfaction_rating.updated_at)
                if bookmark < utils.strptime_with_tz(
                        satisfaction_rating.updated_at) <= end:
                    # NB: We don't trust that the records come back ordered by
                    # updated_at (we've observed out-of-order records),
                    # so we can't save state until we've seen all records
                    self.update_bookmark(state, satisfaction_rating.updated_at)
                if parsed_start <= satisfaction_rating.updated_at <= parsed_end:
                    yield (self.stream, satisfaction_rating)
            if search_window_size <= original_search_window_size // 2:
                search_window_size = search_window_size * 2
                LOGGER.info(
                    "Successfully requested records. Doubling search window to %s seconds",
                    search_window_size)
            singer.write_state(state)

            start = end - datetime.timedelta(seconds=1)
            end = start + datetime.timedelta(seconds=search_window_size)
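The halve-on-overflow, double-after-success windowing above is a reusable pattern. Below is a minimal sketch of just that loop; fetch_count() is a hypothetical stand-in for the Zendesk client, and the 50,000-record cap is taken from the example above, not from any API contract:

import datetime

def fetch_count(start, end):
    # Hypothetical stand-in for the search API: return how many records
    # fall between the two datetimes. A real client call goes here.
    raise NotImplementedError

def date_windows(bookmark, sync_end, window_size, max_records=50000):
    # Yield (start, end) query windows, shrinking the window when the API
    # reports too many results and growing it back after successes.
    original = window_size
    start = bookmark
    while start < sync_end:
        end = start + datetime.timedelta(seconds=window_size)
        if fetch_count(start, min(end, sync_end)) > max_records and window_size > 1:
            # Too many records for one page: halve the window and retry
            # from the same start.
            window_size //= 2
            continue
        yield start, min(end, sync_end)
        if window_size <= original // 2:
            # A previously shrunk window may grow back after a success.
            window_size *= 2
        # The tap above instead sets start = end - 1 second so that
        # adjacent windows overlap and no boundary record is missed.
        start = end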
Example #2
    def sync(self, state):
        original_search_window_size = int(self.config.get('search_window_size', DEFAULT_SEARCH_WINDOW_SIZE))
        search_window_size = original_search_window_size
        bookmark = self.get_bookmark(state)
        start = bookmark - datetime.timedelta(seconds=1)
        end = start + datetime.timedelta(seconds=search_window_size)
        sync_end = singer.utils.now() - datetime.timedelta(minutes=1)
        parsed_sync_end = singer.strftime(sync_end, "%Y-%m-%dT%H:%M:%SZ")

        # ASSUMPTION: updated_at values always come back in UTC
        num_retries = 0
        while start < sync_end:
            parsed_start = singer.strftime(start, "%Y-%m-%dT%H:%M:%SZ")
            parsed_end = min(singer.strftime(end, "%Y-%m-%dT%H:%M:%SZ"), parsed_sync_end)
            LOGGER.info("Querying for users between %s and %s", parsed_start, parsed_end)
            users = self.client.search("", updated_after=parsed_start, updated_before=parsed_end, type="user")

            # NB: Zendesk will return an error on the 1001st record, so we
            # need to check total response size before iterating
            # See: https://develop.zendesk.com/hc/en-us/articles/360022563994--BREAKING-New-Search-API-Result-Limits
            if users.count > 1000:
                if search_window_size > 1:
                    search_window_size = search_window_size // 2
                    end = start + datetime.timedelta(seconds=search_window_size)
                    LOGGER.info("users - Detected Search API response size too large. Cutting search window in half to %s seconds.", search_window_size)
                    continue

                raise Exception("users - Unable to get all users within minimum window of a single second ({}), found {} users within this timestamp. Zendesk can only provide a maximum of 1000 users per request. See: https://develop.zendesk.com/hc/en-us/articles/360022563994--BREAKING-New-Search-API-Result-Limits".format(parsed_start, users.count))

            # Materialize the iterator up front so that records dated before
            # the window start can be detected before anything is emitted
            users = list(users)

            if not all(parsed_start <= user.updated_at for user in users):
                # Only retry up to 30 minutes (60 attempts at 30 seconds each)
                if num_retries < 60:
                    LOGGER.info("users - Record found before date window start. Waiting 30 seconds, then retrying window for consistency. (Retry #%s)", num_retries + 1)
                    time.sleep(30)
                    num_retries += 1
                    continue
                raise AssertionError("users - Record found before date window start and did not resolve after 30 minutes of retrying. Details: window start ({}) is not less than or equal to updated_at value(s) {}".format(
                        parsed_start, [str(user.updated_at) for user in users if user.updated_at < parsed_start]))

            # If we make it here, all quality checks have passed. Reset retry count.
            num_retries = 0
            for user in users:
                if parsed_start <= user.updated_at <= parsed_end:
                    yield (self.stream, user)
            self.update_bookmark(state, parsed_end)

            # Safe to persist state now that the loop above has emitted
            # everything in the window
            singer.write_state(state)
            if search_window_size <= original_search_window_size // 2:
                search_window_size = search_window_size * 2
                LOGGER.info("Successfully requested records. Doubling search window to %s seconds", search_window_size)
            start = end - datetime.timedelta(seconds=1)
            end = start + datetime.timedelta(seconds=search_window_size)
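One design note on the code above: parsed_end is computed as a min() over two formatted strings rather than two datetimes. This is safe because the zero-padded, fixed-width "%Y-%m-%dT%H:%M:%SZ" format sorts lexicographically in the same order as the instants it represents, as a quick check shows:

import datetime

fmt = "%Y-%m-%dT%H:%M:%SZ"
a = datetime.datetime(2021, 1, 2, tzinfo=datetime.timezone.utc)
b = datetime.datetime(2021, 1, 10, tzinfo=datetime.timezone.utc)
# String order agrees with datetime order for this fixed-width format.
assert (a.strftime(fmt) < b.strftime(fmt)) == (a < b)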
Example #3
import dateutil.parser
import singer
from datetime import timezone

def transform_datetime_string(dts):
    # Parse an arbitrary datetime string, normalize it to UTC (treating a
    # naive value as already being UTC), and re-serialize it with
    # singer.strftime.
    parsed_dt = dateutil.parser.parse(dts)
    if parsed_dt.tzinfo is None:
        parsed_dt = parsed_dt.replace(tzinfo=timezone.utc)
    else:
        parsed_dt = parsed_dt.astimezone(timezone.utc)
    return singer.strftime(parsed_dt)
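A hypothetical round trip; the exact output strings assume singer-python's default microsecond-precision UTC format and are illustrative:

# Naive input is treated as UTC, e.g. '2018-01-01T00:00:00.000000Z'
print(transform_datetime_string('2018-01-01 00:00:00'))
# Offset-aware input is converted, e.g. '2018-01-01T05:00:00.000000Z'
print(transform_datetime_string('2018-01-01 00:00:00-05:00'))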
Example #4
def populate_simple_table(dynamodb):
    print('\nPopulating table: simple_table')
    num_items = 50
    table = dynamodb.Table('simple_table')
    table.wait_until_exists()
    start_datetime = datetime.datetime(2018, 1, 1, 0, 0, 0, 0,
                                       tzinfo=datetime.timezone.utc)
    for int_value in range(num_items):
        item_dt = start_datetime + datetime.timedelta(days=(5*int_value))
        table.put_item(
            Item={
                "id": int_value,
                "string_field": random_string_generator(),
                "date_field": singer.strftime(item_dt)
            }
        )

    # Wait for the global secondary index to be backfilled
    while (not table.global_secondary_indexes
           or table.global_secondary_indexes[0]['IndexStatus'] != 'ACTIVE'):
        print('Waiting for index to backfill...')
        time.sleep(5)
        table.reload()

    print('Added {} items to table: simple_table'.format(num_items))
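A sketch of how this helper might be invoked, assuming DynamoDB Local on port 8000; the endpoint, region, and credentials are placeholders, and simple_table (with its global secondary index) must already exist:

import boto3

dynamodb = boto3.resource(
    'dynamodb',
    region_name='us-east-1',
    endpoint_url='http://localhost:8000',  # placeholder: DynamoDB Local
    aws_access_key_id='fake',
    aws_secret_access_key='fake')
populate_simple_table(dynamodb)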