def request_params(
    self, stream_state: Mapping[str, Any], next_page_token: Mapping[str, Any] = None, **kwargs
) -> MutableMapping[str, Any]:
    if next_page_token:
        return dict(next_page_token)

    # For finance APIs, the end date-time must be no later than two minutes before the request was submitted.
    end_date = pendulum.now("utc").subtract(minutes=2, seconds=10).strftime(DATE_TIME_FORMAT)
    if self._replication_end_date:
        end_date = self._replication_end_date

    # The start date and end date should not be more than 180 days apart.
    start_date = max(
        pendulum.parse(self._replication_start_date),
        pendulum.parse(end_date).subtract(days=180),
    ).strftime(DATE_TIME_FORMAT)

    # Log the effective start date so the user knows which value was used.
    logger.info("start date used: %s", start_date)

    params = {
        self.replication_start_date_field: start_date,
        self.replication_end_date_field: end_date,
        self.page_size_field: self.page_size,
    }
    return params
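# A standalone worked example of the 180-day clamp above (the dates and the
# DATE_TIME_FORMAT value here are illustrative assumptions, not taken from the
# source): max() picks the later of the configured start and (end - 180 days),
# so the requested window never exceeds 180 days.
import pendulum

DATE_TIME_FORMAT = "%Y-%m-%dT%H:%M:%SZ"  # assumed format for this sketch
replication_start_date = "2021-01-01T00:00:00Z"
end_date = pendulum.datetime(2022, 1, 1).strftime(DATE_TIME_FORMAT)
start_date = max(
    pendulum.parse(replication_start_date),
    pendulum.parse(end_date).subtract(days=180),
).strftime(DATE_TIME_FORMAT)
print(start_date)  # 2021-07-05T00:00:00Z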
def sleep_on_ratelimit(_details):
    _, exc, _ = sys.exc_info()
    if isinstance(exc, FreshdeskRateLimited):
        retry_after = int(exc.response.headers["Retry-After"])
        logger.info(f"Rate limit reached. Sleeping for {retry_after} seconds")
        time.sleep(retry_after + 1)  # extra second to cover any fraction of a second
def state(self, value):
    potentially_new_records_in_the_past = self._include_deleted and not value.get("include_deleted", False)
    if potentially_new_records_in_the_past:
        logger.info(f"Ignoring bookmark for {self.name} because the `include_deleted` option is enabled")
    else:
        self._state = pendulum.parse(value[self.state_pk])
def sleep_on_ratelimit(_details):
    _, exc, _ = sys.exc_info()
    if isinstance(exc, HubspotRateLimited):
        # The HubSpot API does not always return a Retry-After value for a 429 HTTP error.
        retry_after = int(exc.response.headers.get("Retry-After", 3))
        logger.info(f"Rate limit reached. Sleeping for {retry_after} seconds")
        time.sleep(retry_after + 1)  # extra second to cover any fraction of a second
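# A minimal sketch of how handlers like the two above are typically attached
# with the `backoff` library (this wiring is an assumption, not from the
# source): the handler itself performs the Retry-After sleep, so a constant
# zero-second wait generator is enough here.
import backoff

@backoff.on_exception(
    backoff.constant,
    HubspotRateLimited,            # retry only on rate-limit errors
    max_tries=5,
    interval=0,                    # the handler does the sleeping
    on_backoff=sleep_on_ratelimit,
)
def fetch(session, url):           # hypothetical request helper
    response = session.get(url)
    response.raise_for_status()
    return response.json()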
def _open(self, binary):
    mode = "rb" if binary else "r"
    storage = self.storage_scheme
    url = self.url

    if storage == "gs://":
        return self._open_gcs_url(binary=binary)
    elif storage == "s3://":
        return self._open_aws_url(binary=binary)
    elif storage == "azure://":
        return self._open_azblob_url(binary=binary)
    elif storage == "webhdfs://":
        host = self._provider["host"]
        port = self._provider["port"]
        return smart_open.open(f"webhdfs://{host}:{port}/{url}", mode=mode)
    elif storage in ("ssh://", "scp://", "sftp://"):
        user = self._provider["user"]
        host = self._provider["host"]
        # TODO: Remove int casting when https://github.com/airbytehq/airbyte/issues/4952 is addressed
        # TODO: The "port" field in spec.json must also be changed
        _port_value = self._provider.get("port", 22)
        try:
            port = int(_port_value)
        except ValueError as err:
            raise ValueError(f"{_port_value} is not a valid integer for the port") from err
        # Explicitly turn off ssh keys stored in ~/.ssh
        transport_params = {"connect_kwargs": {"look_for_keys": False}}
        if "password" in self._provider:
            password = self._provider["password"]
            uri = f"{storage}{user}:{password}@{host}:{port}/{url}"
        else:
            uri = f"{storage}{user}@{host}:{port}/{url}"
        return smart_open.open(uri, transport_params=transport_params, mode=mode)
    elif storage in ("https://", "http://"):
        transport_params = None
        if self._provider["user_agent"]:
            airbyte_version = environ.get("AIRBYTE_VERSION", "0.0")
            transport_params = {"headers": {"Accept-Encoding": "identity", "User-Agent": f"Airbyte/{airbyte_version}"}}
        logger.info(f"TransportParams: {transport_params}")
        return smart_open.open(
            self.full_url,
            mode=mode,
            transport_params=transport_params,
        )
    return smart_open.open(self.full_url, mode=mode)
def read(self, getter: Callable, params: Mapping[str, Any] = None) -> Iterator:
    """Apply the state filter to the set of records, then update the cursor (state) at the end if necessary."""
    params = params or {}
    latest_cursor = None
    for record in super().read(getter, params):
        cursor = pendulum.parse(record[self.state_pk])
        # Skip only records older than the buffer window, so late-arriving updates are re-emitted.
        if self._state and self._state.subtract(days=self.buffer_days + 1) >= cursor:
            continue
        latest_cursor = max(cursor, latest_cursor) if latest_cursor else cursor
        yield record

    if latest_cursor:
        logger.info(f"Advancing bookmark for {self.name} stream from {self._state} to {latest_cursor}")
        self._state = max(latest_cursor, self._state) if self._state else latest_cursor
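# A standalone illustration of the buffer window above (values assumed): with
# buffer_days = 7 and a saved state of Jan 10, records with a cursor at or
# before Jan 2 are skipped, while anything newer is re-emitted so late edits
# inside the buffer are picked up again.
import pendulum

state = pendulum.datetime(2021, 1, 10)
buffer_days = 7
cutoff = state.subtract(days=buffer_days + 1)
for cursor in (pendulum.datetime(2021, 1, 1), pendulum.datetime(2021, 1, 5)):
    print(cursor.date(), "skipped" if cutoff >= cursor else "emitted")
# 2021-01-01 skipped
# 2021-01-05 emitted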
def consume(self, credit: int):
    # Reset the time window if it has elapsed.
    if time.time() > self._current_period_start + self._balance_reload_period:
        self.reset_period()

    if self._credits_consumed + credit >= self._max_balance:
        sleep_time = self._balance_reload_period - (time.time() - self._current_period_start)
        logger.info(f"Reached call limit for this minute, waiting for {sleep_time:.2f} seconds")
        time.sleep(max(1.0, sleep_time))
        self.reset_period()

    self._credits_consumed += credit
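# A minimal usage sketch (the class name and client below are hypothetical,
# not from the source): consume() is called before each request, so once
# _max_balance credits are spent within a reload period the caller blocks
# until the window resets.
limiter = CreditBalanceLimiter(max_balance=60, balance_reload_period=60)  # hypothetical constructor
for page in range(100):
    limiter.consume(1)                                 # may sleep until the next window
    response = api_client.get(f"/items?page={page}")   # hypothetical API client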
def read_chunked(
    self, getter: Callable, params: Mapping[str, Any] = None, chunk_size: pendulum.Duration = pendulum.duration(days=1)
) -> Iterator:
    params = {**params} if params else {}
    now_ts = int(pendulum.now().timestamp() * 1000)
    start_ts = int(self._start_date.timestamp() * 1000)
    chunk_size = int(chunk_size.total_seconds() * 1000)
    for ts in range(start_ts, now_ts, chunk_size):
        end_ts = ts + chunk_size
        params["startTimestamp"] = ts
        params["endTimestamp"] = end_ts
        logger.info(
            f"Reading chunk from stream {self.name} between {pendulum.from_timestamp(ts / 1000)} and {pendulum.from_timestamp(end_ts / 1000)}"
        )
        yield from super().read(getter, params)
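# A worked example of the millisecond chunk arithmetic above (dates are
# illustrative): a two-day span with a one-day chunk size yields two
# [startTimestamp, endTimestamp) windows.
import pendulum

start_ts = int(pendulum.datetime(2021, 1, 1).timestamp() * 1000)
now_ts = int(pendulum.datetime(2021, 1, 3).timestamp() * 1000)
chunk_ms = int(pendulum.duration(days=1).total_seconds() * 1000)
for ts in range(start_ts, now_ts, chunk_ms):
    print(pendulum.from_timestamp(ts / 1000), "->", pendulum.from_timestamp((ts + chunk_ms) / 1000))
# 2021-01-01 00:00:00+00:00 -> 2021-01-02 00:00:00+00:00
# 2021-01-02 00:00:00+00:00 -> 2021-01-03 00:00:00+00:00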
def read(self, getter: Callable, params: Mapping[str, Any] = None) -> Iterator:
    """Update the cursor (state)."""
    params = params or {}
    cursor = None
    for record in super().read(getter, params):
        # The Report API returns records from newest to oldest, so the first record carries the latest cursor.
        if not cursor:
            cursor = pendulum.parse(record[self.state_pk])
        record[self.state_pk] = pendulum.parse(record[self.state_pk]).isoformat()
        yield record

    if cursor:
        new_state = max(cursor, self._state) if self._state else cursor
        if new_state != self._state:
            logger.info(f"Advancing bookmark for {self.name} stream from {self._state} to {new_state}")
            self._state = new_state
def _run_job_until_completion(self, params) -> AdReportRun:
    # TODO: parallelize running these jobs
    job = self._get_insights(params)
    logger.info(f"Created AdReportRun: {job} to sync insights with breakdown {self.breakdowns}")
    start_time = pendulum.now()
    sleep_seconds = 2
    while True:
        job = job.api_get()
        job_progress_pct = job["async_percent_completion"]
        logger.info(f"ReportRunId {job['report_run_id']} is {job_progress_pct}% complete")
        runtime = pendulum.now() - start_time

        if job["async_status"] == "Job Completed":
            return job
        elif job["async_status"] == "Job Failed":
            raise JobTimeoutException(f"AdReportRun {job} failed after {runtime.in_seconds()} seconds.")
        elif job["async_status"] == "Job Skipped":
            raise JobTimeoutException(f"AdReportRun {job} skipped after {runtime.in_seconds()} seconds.")

        if runtime > self.MAX_WAIT_TO_START and job_progress_pct == 0:
            raise JobTimeoutException(
                f"AdReportRun {job} did not start after {runtime.in_seconds()} seconds. This is an intermittent error which may be fixed by retrying the job. Aborting."
            )
        elif runtime > self.MAX_WAIT_TO_FINISH:
            raise JobTimeoutException(
                f"AdReportRun {job} did not finish after {runtime.in_seconds()} seconds. This is an intermittent error which may be fixed by retrying the job. Aborting."
            )

        logger.info(f"Sleeping {sleep_seconds} seconds while waiting for AdReportRun: {job} to complete")
        time.sleep(sleep_seconds)
        if sleep_seconds < self.MAX_ASYNC_SLEEP.in_seconds():
            sleep_seconds *= 2
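# An illustration of the capped exponential poll interval above (the cap value
# is assumed; MAX_ASYNC_SLEEP is not defined in this excerpt): the sleep doubles
# from 2 seconds and can overshoot the cap once before it stops growing.
sleep_seconds, cap_seconds = 2, 30  # assumed cap of 30 seconds
schedule = []
for _ in range(8):
    schedule.append(sleep_seconds)
    if sleep_seconds < cap_seconds:
        sleep_seconds *= 2
print(schedule)  # [2, 4, 8, 16, 32, 32, 32, 32]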
def read(self, getter: Callable, params: Mapping[str, Any] = None) -> Iterator:
    """Read using getter, patched to respect current state"""
    params = params or {}
    params = {**params, **self._state_params()}
    latest_cursor = None
    for record in super().read(getter, params):
        cursor = pendulum.parse(record[self.state_pk])
        # Filter out records older than the state.
        if self._state and self._state >= cursor:
            continue
        latest_cursor = max(cursor, latest_cursor) if latest_cursor else cursor
        yield record

    if latest_cursor:
        logger.info(f"Advancing bookmark for {self.name} stream from {self._state} to {latest_cursor}")
        self._state = max(latest_cursor, self._state) if self._state else latest_cursor
def read(self, getter: Callable, params: Mapping[str, Any] = None) -> Iterator:
    """Apply the state filter to the set of records, then update the cursor (state) at the end if necessary."""
    latest_cursor = None
    # To track state correctly: there is no guarantee that the returned records are sorted in
    # ascending order, so keeping an exact boundary ensures no records are missed between states.
    # In the future, if we want to save the state more often, we can do this every batch.
    for record in self.read_chunked(getter, params):
        yield record
        cursor = self._field_to_datetime(record[self.updated_at_field])
        latest_cursor = max(cursor, latest_cursor) if latest_cursor else cursor

    if latest_cursor:
        new_state = max(latest_cursor, self._state) if self._state else latest_cursor
        if new_state != self._state:
            logger.info(f"Advancing bookmark for {self.name} stream from {self._state} to {latest_cursor}")
            self._state = new_state
            self._start_date = self._state
def handle_call_rate_response(exc: FacebookRequestError) -> bool:
    pause_time = DEFAULT_SLEEP_INTERVAL
    platform_header = exc.http_headers().get("x-app-usage") or exc.http_headers().get("x-ad-account-usage")
    if platform_header:
        platform_header = json.loads(platform_header)
        call_count = platform_header.get("call_count") or platform_header.get("acc_id_util_pct")
        if call_count and call_count > 99:
            logger.info(f"Reached platform call limit: {exc}")

    buc_header = exc.http_headers().get("x-business-use-case-usage")
    buc_header = json.loads(buc_header) if buc_header else {}
    for business_object_id, stats in buc_header.items():
        if stats.get("call_count", 0) > 99:
            logger.info(f"Reached call limit on {stats['type']}: {exc}")
            # The BUC header reports estimated_time_to_regain_access in minutes; convert it to a
            # duration so it can be compared with the default interval.
            pause_time = max(pause_time, pendulum.duration(minutes=stats["estimated_time_to_regain_access"]))

    logger.info(f"Sleeping for {pause_time.total_seconds()} seconds")
    sleep(pause_time.total_seconds())
    return True
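# Illustrative (made-up) header payloads shaped to what the parser above
# expects; real Graph API values will differ.
import json

sample_platform_header = '{"call_count": 100, "total_cputime": 25, "total_time": 25}'
sample_buc_header = '{"123456789": {"type": "ads_insights", "call_count": 100, "estimated_time_to_regain_access": 5}}'
print(json.loads(sample_platform_header)["call_count"])    # 100
print(json.loads(sample_buc_header)["123456789"]["type"])  # ads_insights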
def log_retry_attempt(details):
    _, exc, _ = sys.exc_info()
    logger.info(str(exc))
    logger.info(f"Caught retryable error after {details['tries']} tries. Waiting {details['wait']} more seconds then retrying...")
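# For reference, the `details` dict that the `backoff` library passes to
# on_backoff handlers such as log_retry_attempt and the sleep_on_ratelimit
# functions above looks roughly like this (values are illustrative; the
# decorated callable is hypothetical):
def fetch_page(url):  # hypothetical decorated callable
    ...

details = {
    "target": fetch_page,                        # the decorated callable
    "args": ("https://api.example.com/items",),  # positional args of the call
    "kwargs": {},                                # keyword args of the call
    "tries": 3,                                  # attempts so far
    "elapsed": 7.2,                              # seconds since the first attempt
    "wait": 4.0,                                 # seconds backoff will wait before retrying
}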