Example #1
0
    def _make_request(self,
                      http_method,
                      url,
                      headers=None,
                      body=None,
                      stream=False,
                      params=None):
        """Issue an HTTP request through the shared session.

        Args:
            http_method: "GET" or "POST"; any other value raises.
            url: fully-qualified request URL.
            headers: optional dict of HTTP headers.
            body: POST payload (sent as ``data=``); ignored for GET.
            stream: for GET, stream the response body instead of loading it.
            params: query-string parameters for GET requests.

        Returns:
            The response object from the session.

        Raises:
            TapSalesforceException: for an unsupported HTTP method.
            requests.HTTPError: when the response status is 4xx/5xx
                (via ``raise_for_status``).
        """
        if http_method == "GET":
            resp = self.session.get(url,
                                    headers=headers,
                                    stream=stream,
                                    params=params)
        elif http_method == "POST":
            resp = self.session.post(url, headers=headers, data=body)
        else:
            raise TapSalesforceException("Unsupported HTTP method")

        # The original `except RequestException as ex: raise ex` wrapper was
        # a no-op; let raise_for_status propagate its HTTPError directly.
        resp.raise_for_status()

        # Salesforce reports API quota usage in this header; count the call
        # so check_rest_quota_usage can stop us before the org's limit.
        if resp.headers.get("Sforce-Limit-Info") is not None:
            self.rest_requests_attempted += 1
            self.check_rest_quota_usage(resp.headers)

        return resp
Example #2
0
    def _make_request(self,
                      http_method,
                      url,
                      headers=None,
                      body=None,
                      stream=False,
                      params=None):
        """Perform a GET or POST via the shared session and return the response.

        Raises TapSalesforceException for any other HTTP method and HTTPError
        for non-2xx responses; tracks REST quota usage when reported.
        """
        if http_method == "GET":
            LOGGER.info("Making %s request to %s with params: %s", http_method,
                        url, params)
            response = self.session.get(
                url, headers=headers, stream=stream, params=params)
        elif http_method == "POST":
            LOGGER.info("Making %s request to %s with body %s", http_method,
                        url, body)
            response = self.session.post(url, headers=headers, data=body)
        else:
            raise TapSalesforceException("Unsupported HTTP method")

        # Surface non-2xx statuses as HTTPError.
        response.raise_for_status()

        # Salesforce attaches quota info to responses that count against
        # the REST API limit; record those calls.
        limit_info = response.headers.get('Sforce-Limit-Info')
        if limit_info is not None:
            self.rest_requests_attempted += 1
            self.check_rest_quota_usage(response.headers)

        return response
Example #3
0
 def get_blacklisted_fields(self):
     """Return a dict mapping (object, field) pairs that cannot be synced
     with the configured API type to a human-readable reason."""
     if self.api_type == BULK_API_TYPE:
         # The Bulk API cannot serialize this one relationship field.
         return {
             ('EntityDefinition', 'RecordTypesSupported'):
                 "this field is unsupported by the Bulk API."
         }
     if self.api_type == REST_API_TYPE:
         return {}
     raise TapSalesforceException(
         "api_type should be REST or BULK was: {}".format(self.api_type))
def field_to_property_schema(field, mdata):
    """Translate a Salesforce field description into a JSON-schema fragment.

    Args:
        field: Salesforce field description dict with at least 'name'
            and 'type' keys.
        mdata: singer metadata map; may gain "unsupported" entries for
            binary fields.

    Returns:
        A (schema, mdata) tuple. The schema is empty for loose and
        binary types.

    Raises:
        TapSalesforceException: for a Salesforce type with no mapping.
    """
    schema = {}
    name = field['name']
    sf_type = field['type']

    if sf_type in STRING_TYPES:
        schema['type'] = "string"
    elif sf_type in DATE_TYPES:
        # Date values are not always well-formed; accept a plain string too.
        schema["anyOf"] = [
            {"type": "string", "format": "date-time"},
            {"type": ["string", "null"]},
        ]
    elif sf_type == "boolean":
        schema['type'] = "boolean"
    elif sf_type in NUMBER_TYPES:
        schema['type'] = "number"
    elif sf_type == "address":
        schema['type'] = "object"
        schema['properties'] = {
            "street": {"type": ["null", "string"]},
            "state": {"type": ["null", "string"]},
            "postalCode": {"type": ["null", "string"]},
            "city": {"type": ["null", "string"]},
            "country": {"type": ["null", "string"]},
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]},
            "geocodeAccuracy": {"type": ["null", "string"]}
        }
    elif sf_type == "int":
        schema['type'] = "integer"
    elif sf_type == "time":
        schema['type'] = "string"
    elif sf_type in LOOSE_TYPES:
        # No type constraint = all types allowed.
        return schema, mdata
    elif sf_type in BINARY_TYPES:
        # Binary content cannot be represented; mark the field unsupported.
        mdata = metadata.write(mdata, ('properties', name), "inclusion", "unsupported")
        mdata = metadata.write(mdata, ('properties', name),
                               "unsupported-description", "binary data")
        return schema, mdata
    elif sf_type == 'location':
        # Geo coordinates arrive either as a bare number or as an object
        # holding separate lat/long fields.
        schema['type'] = ["number", "object", "null"]
        schema['properties'] = {
            "longitude": {"type": ["null", "number"]},
            "latitude": {"type": ["null", "number"]}
        }
    elif sf_type == 'json':
        schema['type'] = "string"
    else:
        raise TapSalesforceException("Found unsupported type: {}".format(sf_type))

    # The nillable field cannot be trusted, so treat everything except the
    # Id, location, and date-typed fields as nullable.
    if name != 'Id' and sf_type != 'location' and sf_type not in DATE_TYPES:
        schema['type'] = ["null", schema['type']]

    return schema, mdata
Example #5
0
 def get_blacklisted_objects(self):
     """Return the set of Salesforce objects that cannot be queried with
     the configured API type."""
     # Objects that no query style can reach, regardless of API.
     unqueryable = QUERY_RESTRICTED_SALESFORCE_OBJECTS.union(
         QUERY_INCOMPATIBLE_SALESFORCE_OBJECTS)
     if self.api_type == BULK_API_TYPE:
         # Bulk jobs reject additional objects on top of the general set.
         return unqueryable.union(UNSUPPORTED_BULK_API_SALESFORCE_OBJECTS)
     if self.api_type == REST_API_TYPE:
         return unqueryable
     raise TapSalesforceException(
         "api_type should be REST or BULK was: {}".format(self.api_type))
Example #6
0
 def query(self, catalog_entry, state):
     """Dispatch the stream query to the Bulk or REST client per api_type."""
     if self.api_type == BULK_API_TYPE:
         return Bulk(self).query(catalog_entry, state)
     if self.api_type == REST_API_TYPE:
         return Rest(self).query(catalog_entry, state)
     raise TapSalesforceException(
         "api_type should be REST or BULK was: {}".format(self.api_type))
Example #7
0
    def _make_request(self,
                      http_method,
                      url,
                      headers=None,
                      body=None,
                      stream=False,
                      params=None):
        """Issue a GET or POST with a hard timeout and REST-quota tracking.

        Args:
            http_method: "GET" or "POST"; any other value raises.
            url: fully-qualified request URL.
            headers: optional dict of HTTP headers.
            body: POST payload (sent as ``data=``); ignored for GET.
            stream: for GET, stream the response body instead of loading it.
            params: query-string parameters for GET requests.

        Returns:
            The response object from the session.

        Raises:
            TapSalesforceException: for an unsupported HTTP method.
            requests.exceptions.ConnectionError / Timeout: after logging,
                when the server cannot be reached or does not respond in time.
            requests.HTTPError: when the response status is 4xx/5xx.
        """
        request_timeout = 5 * 60  # 5 minute request timeout
        try:
            if http_method == "GET":
                LOGGER.info("Making %s request to %s with params: %s",
                            http_method, url, params)
                resp = self.session.get(
                    url,
                    headers=headers,
                    stream=stream,
                    params=params,
                    timeout=request_timeout,
                )
            elif http_method == "POST":
                LOGGER.info("Making %s request to %s with body %s",
                            http_method, url, body)
                resp = self.session.post(
                    url,
                    headers=headers,
                    data=body,
                    timeout=request_timeout,
                )
            else:
                raise TapSalesforceException("Unsupported HTTP method")
        # ConnectTimeout subclasses ConnectionError, so connection-phase
        # timeouts are reported by the first clause.
        except requests.exceptions.ConnectionError:
            LOGGER.error(
                'Took longer than %s seconds to connect to the server',
                request_timeout)
            # Bare `raise` preserves the original traceback.
            raise
        except requests.exceptions.Timeout:
            LOGGER.error('Took longer than %s seconds to hear from the server',
                         request_timeout)
            raise

        # The original `except RequestException as ex: raise ex` wrapper was
        # a no-op; let raise_for_status propagate its HTTPError directly.
        resp.raise_for_status()

        # Salesforce reports quota usage in this header; count the call so
        # check_rest_quota_usage can stop us before exhausting the limit.
        if resp.headers.get('Sforce-Limit-Info') is not None:
            self.rest_requests_attempted += 1
            self.check_rest_quota_usage(resp.headers)

        return resp
Example #8
0
    def _bulk_query(self, catalog_entry, state):
        """Run a bulk-API query for one stream, yielding its result records.

        Creates a single-batch job and polls it to completion. If the batch
        fails with QUERY_TIMEOUT, falls back to a PK-chunked job and syncs
        its completed batches one at a time, checkpointing state after each
        batch so an interrupted run can be resumed.

        Args:
            catalog_entry: catalog dict for the stream (uses
                'tap_stream_id').
            state: singer state dict; mutated and written out per batch on
                the PK-chunked path.

        Yields:
            Record dicts from the batch result sets.

        Raises:
            TapSalesforceException: when the batch fails for any reason
                other than QUERY_TIMEOUT.
        """
        job_id = self._create_job(catalog_entry)
        start_date = self.sf.get_start_date(state, catalog_entry)

        batch_id = self._add_batch(catalog_entry, job_id, start_date)

        self._close_job(job_id)

        batch_status = self._poll_on_batch_status(job_id, batch_id)

        if batch_status["state"] == "Failed":
            if "QUERY_TIMEOUT" in batch_status["stateMessage"]:
                # The one-shot query timed out; retry the same date range as
                # a PK-chunked job and continue with its job/batch ids.
                batch_status = self._bulk_query_with_pk_chunking(
                    catalog_entry, start_date)
                job_id = batch_status["job_id"]

                # Set pk_chunking to True to indicate that we should write a bookmark differently
                self.sf.pk_chunking = True

                # Add the bulk Job ID and its batches to the state so it can be resumed if necessary
                tap_stream_id = catalog_entry["tap_stream_id"]
                state = singer.write_bookmark(state, tap_stream_id, "JobID",
                                              job_id)
                # [:] stores a copy so removals below don't mutate
                # batch_status["completed"] while we iterate it.
                state = singer.write_bookmark(state, tap_stream_id, "BatchIDs",
                                              batch_status["completed"][:])

                for completed_batch_id in batch_status["completed"]:
                    for result in self.get_batch_results(
                            job_id, completed_batch_id, catalog_entry):
                        yield result
                    # Remove the completed batch ID and write state
                    state["bookmarks"][catalog_entry["tap_stream_id"]][
                        "BatchIDs"].remove(completed_batch_id)
                    LOGGER.info(
                        "Finished syncing batch %s. Removing batch from state.",
                        completed_batch_id,
                    )
                    LOGGER.info(
                        "Batches to go: %d",
                        len(state["bookmarks"][catalog_entry["tap_stream_id"]]
                            ["BatchIDs"]),
                    )
                    singer.write_state(state)
            else:
                raise TapSalesforceException(batch_status["stateMessage"])
        else:
            # Happy path: a single successful batch.
            for result in self.get_batch_results(job_id, batch_id,
                                                 catalog_entry):
                yield result
Example #9
0
    def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
        """Re-run the bulk query as a PK-chunked job.

        Returns the final batch status dict with the new job id stored under
        'job_id'. Raises if any chunked batch failed.
        """
        LOGGER.info("Retrying Bulk Query with PK Chunking")

        # PK chunking requires a brand-new job.
        chunked_job_id = self._create_job(catalog_entry, True)
        self._add_batch(catalog_entry, chunked_job_id, start_date, False)

        status = self._poll_on_pk_chunked_batch_status(chunked_job_id)
        status['job_id'] = chunked_job_id

        if status['failed']:
            raise TapSalesforceException("One or more batches failed during PK chunked job")

        # Only close the job once every batch has completed.
        self._close_job(chunked_job_id)

        return status
Example #10
0
    def _bulk_query_with_pk_chunking(self, catalog_entry, start_date):
        """Re-run the bulk query as a PK-chunked job.

        Returns the final batch status dict with the new job id stored under
        'job_id'. Raises with a summary of the failures if any chunked
        batch failed.
        """
        LOGGER.info("Retrying Bulk Query with PK Chunking")

        # PK chunking requires a brand-new job.
        chunked_job_id = self._create_job(catalog_entry, True)
        self._add_batch(catalog_entry, chunked_job_id, start_date, False)

        status = self._poll_on_pk_chunked_batch_status(chunked_job_id)
        status['job_id'] = chunked_job_id

        failed = status['failed']
        if failed:
            total = len(status['completed']) + len(failed)
            raise TapSalesforceException(
                "One or more batches failed during PK chunked job. {} failed out of {} total batches. First 20 failed batches: {}".format(
                    len(failed),
                    total,
                    list(failed.items())[:20]))

        # Only close the job once every batch has completed.
        self._close_job(chunked_job_id)

        return status
Example #11
0
    def _query_recur(self,
                     query,
                     catalog_entry,
                     start_date_str,
                     end_date=None,
                     retries=MAX_RETRIES):
        """Run a REST queryAll, halving the date window on QUERY_TIMEOUT.

        Yields records for `query`. If Salesforce responds with a
        QUERY_TIMEOUT error, the [start_date, end_date) window is halved and
        the first half retried recursively with one fewer retry. After a
        chunked window completes, a further recursive call covers
        end_date -> the time this sync started.

        Args:
            query: SOQL query string to execute.
            catalog_entry: catalog dict for the stream (uses 'stream').
            start_date_str: ISO start of the window being queried.
            end_date: exclusive end of the window; defaults to "now".
            retries: remaining retry budget; 0 aborts the sync.

        Raises:
            TapSalesforceException: when retries are exhausted or the window
                can no longer be halved.
            HTTPError: for any non-QUERY_TIMEOUT error from Salesforce.
        """
        params = {"q": query}
        url = "{}/services/data/v52.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        sync_start = singer_utils.now()
        if end_date is None:
            end_date = sync_start

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".
                format(catalog_entry['stream']))

        retryable = False
        try:
            for rec in self._sync_records(url, headers, params):
                yield rec

            # If the date range was chunked (an end_date was passed), sync
            # from the end_date -> now
            if end_date < sync_start:
                next_start_date_str = singer_utils.strftime(end_date)
                query = self.sf._build_query_string(catalog_entry,
                                                    next_start_date_str)
                for record in self._query_recur(query,
                                                catalog_entry,
                                                next_start_date_str,
                                                retries=retries):
                    yield record

        except HTTPError as ex:
            # A QUERY_TIMEOUT arrives as a list payload with an errorCode;
            # mark it retryable and fall through so we don't recurse from
            # inside the except block. Anything else is re-raised.
            response = ex.response.json()
            if isinstance(
                    response,
                    list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range, catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            # Halve the window and retry the first half; 0-day halves would
            # never shrink further, so abort instead of looping forever.
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping."
                )

            query = self.sf._build_query_string(
                catalog_entry, singer_utils.strftime(start_date),
                singer_utils.strftime(end_date))
            for record in self._query_recur(query, catalog_entry,
                                            start_date_str, end_date,
                                            retries - 1):
                yield record
Example #12
0
    def _query_recur(self,
                     query,
                     catalog_entry,
                     start_date_str,
                     end_date=None,
                     retries=MAX_RETRIES):
        """Page through a REST queryAll, halving the date window on timeout.

        Yields records from the query results, following nextRecordsUrl
        pagination. On a QUERY_TIMEOUT error the [start_date, end_date)
        window is halved and the first half retried recursively with one
        fewer retry.

        Args:
            query: SOQL query string to execute.
            catalog_entry: catalog dict for the stream (uses 'stream').
            start_date_str: ISO start of the window being queried.
            end_date: exclusive end of the window; defaults to "now".
            retries: remaining retry budget; 0 aborts the sync.

        Raises:
            TapSalesforceException: when retries are exhausted or the window
                can no longer be halved.
            HTTPError: for any non-QUERY_TIMEOUT error from Salesforce.
        """
        params = {"q": query}
        url = "{}/services/data/v41.0/queryAll".format(self.sf.instance_url)
        headers = self.sf._get_standard_headers()

        if end_date is None:
            end_date = singer_utils.now()

        if retries == 0:
            raise TapSalesforceException(
                "Ran out of retries attempting to query Salesforce Object {}".
                format(catalog_entry['stream']))

        retryable = False
        try:
            while True:
                resp = self.sf._make_request('GET',
                                             url,
                                             headers=headers,
                                             params=params)
                resp_json = resp.json()

                for rec in resp_json.get('records'):
                    yield rec

                # Follow server-side pagination until it runs out.
                next_records_url = resp_json.get('nextRecordsUrl')

                if next_records_url is None:
                    break
                else:
                    url = "{}{}".format(self.sf.instance_url, next_records_url)

        except HTTPError as ex:
            # A QUERY_TIMEOUT arrives as a list payload with an errorCode;
            # mark it retryable and fall through rather than recursing from
            # inside the except block. Anything else is re-raised.
            response = ex.response.json()
            if isinstance(
                    response,
                    list) and response[0].get("errorCode") == "QUERY_TIMEOUT":
                start_date = singer_utils.strptime_with_tz(start_date_str)
                day_range = (end_date - start_date).days
                LOGGER.info(
                    "Salesforce returned QUERY_TIMEOUT querying %d days of %s",
                    day_range, catalog_entry['stream'])
                retryable = True
            else:
                raise ex

        if retryable:
            # Halve the window and retry the first half; a 0-day half would
            # never shrink further, so abort instead of looping forever.
            start_date = singer_utils.strptime_with_tz(start_date_str)
            half_day_range = (end_date - start_date) // 2
            end_date = end_date - half_day_range

            if half_day_range.days == 0:
                raise TapSalesforceException(
                    "Attempting to query by 0 day range, this would cause infinite looping."
                )

            # BUG FIX: datetime objects have no .format() method, so the
            # original `start_date.format(...)` raised AttributeError on
            # every timeout retry; strftime renders the SOQL datetimes.
            query = self.sf._build_query_string(
                catalog_entry, start_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
                end_date.strftime("%Y-%m-%dT%H:%M:%SZ"))
            for record in self._query_recur(query, catalog_entry,
                                            start_date_str, end_date,
                                            retries - 1):
                yield record