def _make_request(self, http_method, url, headers=None, body=None, stream=False, params=None):
    if http_method == "GET":
        resp = self.session.get(url, headers=headers, stream=stream, params=params)
    elif http_method == "POST":
        resp = self.session.post(url, headers=headers, data=body)
    else:
        raise TapSalesforceException("Unsupported HTTP method")

    try:
        resp.raise_for_status()
    except RequestException as ex:
        raise ex

    if resp.headers.get("Sforce-Limit-Info") is not None:
        self.rest_requests_attempted += 1
        self.check_rest_quota_usage(resp.headers)

    return resp
def _make_request(self, http_method, url, headers=None, body=None, stream=False, params=None):
    if http_method == "GET":
        LOGGER.info("Making %s request to %s with params: %s", http_method, url, params)
        resp = self.session.get(url, headers=headers, stream=stream, params=params)
    elif http_method == "POST":
        LOGGER.info("Making %s request to %s with body %s", http_method, url, body)
        resp = self.session.post(url, headers=headers, data=body)
    else:
        raise TapSalesforceException("Unsupported HTTP method")

    resp.raise_for_status()

    if resp.headers.get('Sforce-Limit-Info') is not None:
        self.rest_requests_attempted += 1
        self.check_rest_quota_usage(resp.headers)

    return resp
def get_blacklisted_fields(self):
    if self.api_type == BULK_API_TYPE:
        return {('EntityDefinition', 'RecordTypesSupported'):
                "this field is unsupported by the Bulk API."}
    elif self.api_type == REST_API_TYPE:
        return {}
    else:
        raise TapSalesforceException(
            "api_type should be REST or BULK was: {}".format(self.api_type))
def field_to_property_schema(field, mdata):
    property_schema = {}
    field_name = field['name']
    sf_type = field['type']

    if sf_type in STRING_TYPES:
        property_schema['type'] = "string"
    elif sf_type in DATE_TYPES:
        date_type = {"type": "string", "format": "date-time"}
        string_type = {"type": ["string", "null"]}
        property_schema["anyOf"] = [date_type, string_type]
    elif sf_type == "boolean":
        property_schema['type'] = "boolean"
    elif sf_type in NUMBER_TYPES:
        property_schema['type'] = "number"
    elif sf_type == "address":
        property_schema['type'] = "object"
        property_schema['properties'] = {
            "street": {"type": ["null", "string"]},
            "state": {"type": ["null", "string"]},
            "postalCode": {"type": ["null", "string"]},
            "city": {"type": ["null", "string"]},
            "country": {"type": ["null", "string"]},
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]},
            "geocodeAccuracy": {"type": ["null", "string"]}
        }
    elif sf_type == "int":
        property_schema['type'] = "integer"
    elif sf_type == "time":
        property_schema['type'] = "string"
    elif sf_type in LOOSE_TYPES:
        return property_schema, mdata  # No type = all types
    elif sf_type in BINARY_TYPES:
        mdata = metadata.write(mdata, ('properties', field_name), "inclusion", "unsupported")
        mdata = metadata.write(mdata, ('properties', field_name),
                               "unsupported-description", "binary data")
        return property_schema, mdata
    elif sf_type == 'location':
        # geo coordinates are numbers or objects divided into two fields for lat/long
        property_schema['type'] = ["number", "object", "null"]
        property_schema['properties'] = {
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]}
        }
    elif sf_type == 'json':
        property_schema['type'] = "string"
    else:
        raise TapSalesforceException("Found unsupported type: {}".format(sf_type))

    # The nillable field cannot be trusted
    if field_name != 'Id' and sf_type != 'location' and sf_type not in DATE_TYPES:
        property_schema['type'] = ["null", property_schema['type']]

    return property_schema, mdata
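# A minimal illustration (not part of the original module) of the schemas the
# mapping above produces. A Salesforce "date" field becomes an anyOf so values
# that fail date parsing can fall back to plain strings, and every non-Id,
# non-date field is made nullable because the describe API's `nillable` flag
# cannot be trusted.
expected_date_schema = {
    "anyOf": [
        {"type": "string", "format": "date-time"},
        {"type": ["string", "null"]},
    ]
}
expected_string_schema = {"type": ["null", "string"]}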
def get_blacklisted_objects(self):
    if self.api_type == BULK_API_TYPE:
        return UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS.union(
            QUERY_RESTRICTED_SALESFORCE_OBJECTS).union(QUERY_INCOMPATIBLE_SALESFORCE_OBJECTS)
    elif self.api_type == REST_API_TYPE:
        return QUERY_RESTRICTED_SALESFORCE_OBJECTS.union(QUERY_INCOMPATIBLE_SALESFORCE_OBJECTS)
    else:
        raise TapSalesforceException(
            "api_type should be REST or BULK was: {}".format(self.api_type))
def query(self, catalog_entry, state):
    if self.api_type == BULK_API_TYPE:
        bulk = Bulk(self)
        return bulk.query(catalog_entry, state)
    elif self.api_type == REST_API_TYPE:
        rest = Rest(self)
        return rest.query(catalog_entry, state)
    else:
        raise TapSalesforceException(
            "api_type should be REST or BULK was: {}".format(self.api_type))
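# Hedged usage sketch: `sf` is assumed to be an instance of the class above,
# already configured with an `api_type`; `catalog_entry` and `state` follow
# the Singer catalog/state conventions used throughout this module.
for record in sf.query(catalog_entry, state):
    singer.write_record(catalog_entry["stream"], record)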
def _make_request(self, http_method, url, headers=None, body=None, stream=False, params=None):
    request_timeout = 5 * 60  # 5 minute request timeout

    try:
        if http_method == "GET":
            LOGGER.info("Making %s request to %s with params: %s", http_method, url, params)
            resp = self.session.get(
                url,
                headers=headers,
                stream=stream,
                params=params,
                timeout=request_timeout,
            )
        elif http_method == "POST":
            LOGGER.info("Making %s request to %s with body %s", http_method, url, body)
            resp = self.session.post(
                url,
                headers=headers,
                data=body,
                timeout=request_timeout,
            )
        else:
            raise TapSalesforceException("Unsupported HTTP method")
    except requests.exceptions.ConnectionError as connection_err:
        LOGGER.error('Took longer than %s seconds to connect to the server', request_timeout)
        raise connection_err
    except requests.exceptions.Timeout as timeout_err:
        LOGGER.error('Took longer than %s seconds to hear from the server', request_timeout)
        raise timeout_err

    try:
        resp.raise_for_status()
    except RequestException as ex:
        raise ex

    if resp.headers.get('Sforce-Limit-Info') is not None:
        self.rest_requests_attempted += 1
        self.check_rest_quota_usage(resp.headers)

    return resp
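# Hedged usage sketch (names assumed, not from the original): issuing a REST
# query through `_make_request`, which logs the call, enforces the 5-minute
# timeout, raises on HTTP errors, and tracks the Sforce-Limit-Info quota header.
url = "{}/services/data/v52.0/queryAll".format(sf.instance_url)
resp = sf._make_request("GET", url,
                        headers=sf._get_standard_headers(),
                        params={"q": "SELECT Id FROM Account"})
records = resp.json().get("records", [])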
def _bulk_query(self, catalog_entry, state):
    job_id = self._create_job(catalog_entry)
    start_date = self.sf.get_start_date(state, catalog_entry)
    batch_id = self._add_batch(catalog_entry, job_id, start_date)

    self._close_job(job_id)

    batch_status = self._poll_on_batch_status(job_id, batch_id)

    if batch_status["state"] == "Failed":
        if "QUERY_TIMEOUT" in batch_status["stateMessage"]:
            batch_status = self._bulk_query_with_pk_chunking(catalog_entry, start_date)
            job_id = batch_status["job_id"]

            # Set pk_chunking to True to indicate that we should write a bookmark differently
            self.sf.pk_chunking = True

            # Add the bulk Job ID and its batches to the state so it can be resumed if necessary
            tap_stream_id = catalog_entry["tap_stream_id"]
            state = singer.write_bookmark(state, tap_stream_id, "JobID", job_id)
            state = singer.write_bookmark(state, tap_stream_id, "BatchIDs",
                                          batch_status["completed"][:])

            for completed_batch_id in batch_status["completed"]:
                for result in self.get_batch_results(job_id, completed_batch_id, catalog_entry):
                    yield result

                # Remove the completed batch ID and write state
                state["bookmarks"][catalog_entry["tap_stream_id"]]["BatchIDs"].remove(
                    completed_batch_id)
                LOGGER.info("Finished syncing batch %s. Removing batch from state.",
                            completed_batch_id)
                LOGGER.info("Batches to go: %d",
                            len(state["bookmarks"][catalog_entry["tap_stream_id"]]["BatchIDs"]))
                singer.write_state(state)
        else:
            raise TapSalesforceException(batch_status["stateMessage"])
    else:
        for result in self.get_batch_results(job_id, batch_id, catalog_entry):
            yield result
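# Hedged illustration of the bookmark shape written above for a PK-chunked job
# (stream name and IDs are invented examples): "JobID" pins the bulk job and
# "BatchIDs" shrinks as each completed batch is emitted, so an interrupted sync
# can resume with only the remaining batches.
state = {
    "bookmarks": {
        "Account": {
            "JobID": "750xx000000005LAAQ",
            "BatchIDs": ["751xx000000005MAAQ", "751xx000000005NAAQ"],
        }
    }
}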
def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
    LOGGER.info("Retrying Bulk Query with PK Chunking")

    # Create a new job
    job_id = self._create_job(catalog_entry, True)

    self._add_batch(catalog_entry, job_id, start_date, False)

    batch_status = self._poll_on_pk_chunked_batch_status(job_id)
    batch_status['job_id'] = job_id

    if batch_status['failed']:
        raise TapSalesforceException("One or more batches failed during PK chunked job")

    # Close the job after all the batches are complete
    self._close_job(job_id)

    return batch_status
def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
    LOGGER.info("Retrying Bulk Query with PK Chunking")

    # Create a new job
    job_id = self._create_job(catalog_entry, True)

    self._add_batch(catalog_entry, job_id, start_date, False)

    batch_status = self._poll_on_pk_chunked_batch_status(job_id)
    batch_status['job_id'] = job_id

    if batch_status['failed']:
        raise TapSalesforceException(
            "One or more batches failed during PK chunked job. "
            "{} failed out of {} total batches. First 20 failed batches: {}".format(
                len(batch_status['failed']),
                len(batch_status['completed']) + len(batch_status['failed']),
                list(batch_status['failed'].items())[:20]))

    # Close the job after all the batches are complete
    self._close_job(job_id)

    return batch_status
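# Hedged illustration of the `batch_status` dict implied by the code above
# (IDs and messages are invented examples): "completed" lists finished batch
# IDs, while "failed" maps each failed batch ID to its state message.
batch_status = {
    "job_id": "750xx000000005LAAQ",
    "completed": ["751xx000000005MAAQ"],
    "failed": {"751xx000000005NAAQ": "InternalServerError: retried more than ten times"},
}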
def _query_recur(self, query, catalog_entry, start_date_str, end_date=None, retries=MAX_RETRIES):
    params = {"q": query}
    url = "{}/services/data/v52.0/queryAll".format(self.sf.instance_url)
    headers = self.sf._get_standard_headers()

    sync_start = singer_utils.now()
    if end_date is None:
        end_date = sync_start

    if retries == 0:
        raise TapSalesforceException(
            "Ran out of retries attempting to query Salesforce Object {}".format(
                catalog_entry['stream']))

    retryable = False
    try:
        for rec in self._sync_records(url, headers, params):
            yield rec

        # If the date range was chunked (an end_date was passed), sync
        # from the end_date -> now
        if end_date < sync_start:
            next_start_date_str = singer_utils.strftime(end_date)
            query = self.sf._build_query_string(catalog_entry, next_start_date_str)
            for record in self._query_recur(query, catalog_entry, next_start_date_str,
                                            retries=retries):
                yield record

    except HTTPError as ex:
        response = ex.response.json()
        if isinstance(response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
            start_date = singer_utils.strptime_with_tz(start_date_str)
            day_range = (end_date - start_date).days
            LOGGER.info("Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                        day_range, catalog_entry['stream'])
            retryable = True
        else:
            raise ex

    if retryable:
        start_date = singer_utils.strptime_with_tz(start_date_str)
        half_day_range = (end_date - start_date) // 2
        end_date = end_date - half_day_range

        if half_day_range.days == 0:
            raise TapSalesforceException(
                "Attempting to query by 0 day range, this would cause infinite looping.")

        query = self.sf._build_query_string(catalog_entry,
                                            singer_utils.strftime(start_date),
                                            singer_utils.strftime(end_date))
        for record in self._query_recur(query, catalog_entry, start_date_str,
                                        end_date, retries - 1):
            yield record
def _query_recur(self, query, catalog_entry, start_date_str, end_date=None, retries=MAX_RETRIES):
    params = {"q": query}
    url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
    headers = self.sf._get_standard_headers()

    if end_date is None:
        end_date = singer_utils.now()

    if retries == 0:
        raise TapSalesforceException(
            "Ran out of retries attempting to query Salesforce Object {}".format(
                catalog_entry['stream']))

    retryable = False
    try:
        # Page through the query results, following nextRecordsUrl until exhausted
        while True:
            resp = self.sf._make_request('GET', url, headers=headers, params=params)
            resp_json = resp.json()

            for rec in resp_json.get('records'):
                yield rec

            next_records_url = resp_json.get('nextRecordsUrl')
            if next_records_url is None:
                break
            else:
                url = "{}{}".format(self.sf.instance_url, next_records_url)

    except HTTPError as ex:
        response = ex.response.json()
        if isinstance(response, list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
            start_date = singer_utils.strptime_with_tz(start_date_str)
            day_range = (end_date - start_date).days
            LOGGER.info("Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                        day_range, catalog_entry['stream'])
            retryable = True
        else:
            raise ex

    if retryable:
        start_date = singer_utils.strptime_with_tz(start_date_str)
        half_day_range = (end_date - start_date) // 2
        end_date = end_date - half_day_range

        if half_day_range.days == 0:
            raise TapSalesforceException(
                "Attempting to query by 0 day range, this would cause infinite looping.")

        # datetime objects have no .format() method; strftime renders the ISO timestamps
        query = self.sf._build_query_string(catalog_entry,
                                            start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
                                            end_date.strftime("%Y-%m-%dT%H:%M:%SZ"))
        for record in self._query_recur(query, catalog_entry, start_date_str,
                                        end_date, retries - 1):
            yield record
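# Hedged sketch of the window-halving retry above: after a QUERY_TIMEOUT the
# date range is repeatedly cut in half, e.g. 30 days -> 15 -> 7 -> 3 -> 1, and
# a half-range of zero whole days aborts to avoid looping forever.
from datetime import datetime

start = datetime(2021, 1, 1)
end = datetime(2021, 1, 31)
while True:
    half = (end - start) // 2
    if half.days == 0:
        break  # same guard as _query_recur's 0-day-range exception
    end = end - half  # retry the first half of the window
    print("retrying with a window of", (end - start).days, "days")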