Python SalesforceBulk.batch_state Examples

Programming Language: Python

Namespace/Package Name: salesforce_bulk

Class/Type: SalesforceBulk

Method/Function: batch_state

Examples at hotexamples.com: 2

Python SalesforceBulk.batch_state - 2 examples found. These are the top rated real world Python examples of salesforce_bulk.SalesforceBulk.batch_state extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

SalesforceBulk(30)

batch_state(2)

abort_job(1)

get_batch_result_ids(1)

Example #1

Show file

File: fetcher.py Project: mozilla-it/salesforce-fetcher

class SalesforceFetcher(object):
    """
    Class that encapsulates all the fetching logic for SalesForce.
    """
    def __init__(self, config_path):
        """
        Bootstrap a fetcher class
        :param config_path: Path to the configuration file to use for this instance
        """
        # Get settings
        with open(config_path, 'r') as f:
            self.settings = yaml.safe_load(f)

        # Configure the logger
        log_level = (logging.WARN, logging.DEBUG)[self.settings['debug']]
        LOG_FORMAT = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        logger = logging.getLogger("salesforce-fetcher")
        logger.setLevel(log_level)

        ch = logging.StreamHandler()
        ch.setFormatter(LOG_FORMAT)
        logger.addHandler(ch)

        logger.debug("Logging is set to DEBUG level")
        # let's not output the password
        #logger.debug("Settings: %s" % self.settings)

        self.logger = logger
        self.salesforce = Salesforce(**self.settings['salesforce']['auth'])
        self.salesforce_bulk = SalesforceBulk(**self.settings['salesforce']
                                              ['auth'],
                                              API_version='46.0')

        # Make sure output dir is created
        output_directory = self.settings['output_dir']
        if not os.path.exists(output_directory):
            os.makedirs(output_directory)

    def fetch_all(self, fetch_only, airflow_date, fetch_method, days_lookback):
        """
        Fetch any reports or queries, writing them out as files in the output_dir
        """
        queries = self.load_queries()
        for name, query in queries.items():
            if fetch_only and name != fetch_only:
                self.logger.debug(
                    "'--fetch-only %s' specified. Skipping fetch of %s" %
                    (fetch_only, name))
                continue
            #if name == 'contacts' or name == 'opportunity':
            if fetch_method and fetch_method == 'bulk':
                self.fetch_soql_query_bulk(name, query, airflow_date)
            else:
                self.fetch_soql_query(name, query, airflow_date)

        reports = self.settings['salesforce']['reports']
        for name, report_url in reports.items():
            if fetch_only and name != fetch_only:
                self.logger.debug(
                    "'--fetch-only %s' specified. Skipping fetch of %s" %
                    (fetch_only, name))
                continue
            self.fetch_report(name, report_url, airflow_date)

        if fetch_only:
            if fetch_only == 'contact_deletes':
                self.fetch_contact_deletes(days=days_lookback,
                                           airflow_date=airflow_date)
        else:
            self.fetch_contact_deletes(days=days_lookback,
                                       airflow_date=airflow_date)

        self.logger.info("Job Completed")

    def fetch_contact_deletes(self, days=29, airflow_date=None):
        """
        Fetches all deletes from Contact for X days
        :param days: Fetch deletes from this number of days to present
        :return:
        """
        path = self.create_output_path('contact_deletes',
                                       airflow_date=airflow_date)
        end = datetime.datetime.now(
            pytz.UTC)  # we need to use UTC as salesforce API requires this!
        records = self.salesforce.Contact.deleted(
            end - datetime.timedelta(days=days), end)
        data_list = records['deletedRecords']
        if len(data_list) > 0:
            fieldnames = list(data_list[0].keys())
            with open(path, 'w') as f:
                writer = DictWriter(f,
                                    fieldnames=fieldnames,
                                    quoting=QUOTE_ALL)
                writer.writeheader()
                for delta_record in data_list:
                    writer.writerow(delta_record)

    def fetch_report(self, name, report_url, airflow_date=None):
        """
        Fetches a single prebuilt Salesforce report via an HTTP request
        :param name: Name of the report to fetch
        :param report_url: Base URL for the report
        :return:
        """

        self.logger.info("Fetching report - %s" % name)
        sf_host = self.settings['salesforce']['host']
        url = "%s%s?view=d&snip&export=1&enc=UTF-8&xf=csv" % (sf_host,
                                                              report_url)

        resp = requests.get(url,
                            headers=self.salesforce.headers,
                            cookies={'sid': self.salesforce.session_id},
                            stream=True)

        path = self.create_output_path(name, airflow_date=airflow_date)
        with open(path, 'w+') as f:
            # Write the full contents
            f.write(resp.text.replace("\"", ""))

            # Remove the Salesforce footer (last 7 lines)
            f.seek(0, os.SEEK_END)
            pos = f.tell() - 1

            count = 0
            while pos > 0 and count < 7:
                pos -= 1
                f.seek(pos, os.SEEK_SET)
                if f.read(1) == "\n":
                    count += 1

            # So long as we're not at the start of the file, delete all the characters ahead of this position
            if pos > 0:
                # preserve the last newline then truncate the file
                pos += 1
                f.seek(pos, os.SEEK_SET)
                f.truncate()

    def fetch_soql_query_bulk(self, name, query, airflow_date=None):
        self.logger.info("BULK Executing %s" % name)
        self.logger.info("BULK Query is: %s" % query)
        if name == 'contacts' or name == 'contact_updates':
            table_name = 'Contact'
        elif name == 'opportunity' or name == 'opportunity_updates':
            table_name = 'Opportunity'
        job = self.salesforce_bulk.create_query_job(table_name,
                                                    contentType='CSV',
                                                    pk_chunking=True,
                                                    concurrency='Parallel')
        self.logger.info("job: %s" % job)
        batch = self.salesforce_bulk.query(job, query)
        #        job = '7504O00000LUxuCQAT'
        #        batch = '7514O00000TvapeQAB'
        self.logger.info("Bulk batch created: %s" % batch)

        while True:
            batch_state = self.salesforce_bulk.batch_state(
                batch, job_id=job, reload=True).lower()
            if batch_state == 'notprocessed':
                self.logger.info("master batch is done")
                break
            elif batch_state == 'aborted' or batch_state == 'failed':
                self.logger.error("master batch failed")
                self.logger.error(
                    self.salesforce_bulk.batch_status(batch_id=batch,
                                                      job_id=job,
                                                      reload=True))
                raise Exception("master batch failed")
            self.logger.info("waiting for batch to be done. status=%s" %
                             batch_state)
            time.sleep(10)

        count = 0
        downloaded = {}

        pool = mp.Pool(5)

        while True:
            stats = {}
            batch_count = 0
            all_batches = self.salesforce_bulk.get_batch_list(job)
            for batch_info in all_batches:
                batch_count += 1

                batch_state = batch_info['state'].lower()
                if batch_state in stats:
                    stats[batch_state] += 1
                else:
                    stats[batch_state] = 1

                if batch_info['id'] == batch:
                    #self.logger.debug("skipping the master batch id")
                    continue
                elif batch_info['id'] in downloaded:
                    #self.logger.debug("batch %s already downloaded" % batch_info['id'])
                    continue

                if batch_state == 'completed':
                    self.logger.debug(
                        "batch %s (%s of %s)" %
                        (batch_info['id'], batch_count, len(all_batches)))

                    for result_id in self.salesforce_bulk.get_query_batch_result_ids(
                            batch_info['id'], job_id=job):
                        self.logger.debug("result_id: %s" % result_id)
                        path = self.create_output_path(
                            name, result_id, airflow_date=airflow_date)
                        pool.apply_async(
                            get_and_write_bulk_results,
                            args=(batch_info['id'], result_id, job,
                                  self.salesforce_bulk.endpoint,
                                  self.salesforce_bulk.headers(), path))

                    downloaded[batch_info['id']] = 1

                elif batch_state == 'failed':
                    downloaded[batch_info['id']] = 1
                    self.logger.error("batch %s failed!" % batch_info['id'])
                    self.logger.error(
                        self.salesforce_bulk.batch_status(
                            batch_id=batch_info['id'], job_id=job,
                            reload=True))

            if 'completed' in stats and stats['completed'] + 1 == batch_count:
                self.logger.info("all batches retrieved")
                break
            elif 'failed' in stats and stats['failed'] + 1 == batch_count:
                self.logger.error("NO batches retrieved")
                self.logger.error(
                    self.salesforce_bulk.batch_status(batch_id=batch,
                                                      job_id=job,
                                                      reload=True))
                raise Exception("NO batches retrieved")
            elif 'failed' in stats and stats['failed'] + stats[
                    'completed'] == batch_count:
                self.logger.warning("all batches WITH SOME FAILURES")
                break
            else:
                self.logger.info(stats)
                time.sleep(5)

        try:
            self.salesforce_bulk.close_job(job)
        except:
            pass
        pool.close()
        pool.join()

    def fetch_soql_query(self, name, query, airflow_date=None):
        self.logger.info("Executing %s" % name)
        self.logger.info("Query is: %s" % query)
        path = self.create_output_path(name, airflow_date=airflow_date)
        result = self.salesforce.query(query)
        self.logger.info("First result set received")
        batch = 0
        count = 0
        if result['records']:
            fieldnames = list(result['records'][0].keys())
            fieldnames.pop(0)  # get rid of attributes
            with open(path, 'w') as f:
                writer = DictWriter(f,
                                    fieldnames=fieldnames,
                                    quoting=QUOTE_ALL)
                writer.writeheader()

                while True:
                    batch += 1
                    for row in result['records']:
                        # each row has a strange attributes key we don't want
                        row.pop('attributes', None)
                        out_dict = {}
                        for key, value in row.items():
                            if type(value) is collections.OrderedDict:
                                out_dict[key] = json.dumps(value)
                            else:
                                out_dict[key] = value
                        writer.writerow(out_dict)
                        count += 1
                        if count % 100000 == 0:
                            self.logger.debug("%s rows fetched" % count)

                    # fetch next batch if we're not done else break out of loop
                    if not result['done']:
                        result = self.salesforce.query_more(
                            result['nextRecordsUrl'], True)
                    else:
                        break

        else:
            self.logger.warn("No results returned for %s" % name)

    def create_output_path(self, name, filename='output', airflow_date=None):
        output_dir = self.settings['output_dir']
        if airflow_date:
            date = airflow_date
        else:
            date = time.strftime("%Y-%m-%d")
        child_dir = os.path.join(output_dir, name, date)
        if not os.path.exists(child_dir):
            os.makedirs(child_dir)

        filename = filename + ".csv"
        file_path = os.path.join(child_dir, filename)
        self.logger.info("Writing output to %s" % file_path)
        return file_path

    def create_custom_query(self,
                            table_name='Contact',
                            dir='/usr/local/salesforce_fetcher/queries',
                            updates_only=False):
        """
        The intention is to have Travis upload the "contact_fields.yaml" file
        to a bucket where it can be pulled down dynamically by this script
        and others (instead of having to rebuild the image on each change)
        """

        fields_file_name = table_name.lower() + '_fields.yaml'
        fields_file = os.path.join(dir, fields_file_name)
        if not os.path.exists(fields_file):
            return
        with open(fields_file, 'r') as stream:
            columns = yaml.safe_load(stream)

        query = "SELECT "
        for field in columns['fields']:
            query += next(iter(field)) + ', '

        query = query[:-2] + " FROM " + table_name
        if updates_only:
            query += " WHERE LastModifiedDate >= LAST_N_DAYS:3"

        return query

    def load_queries(self):
        """
        load queries from an external directory
        :return: a dict containing all the SOQL queries to be executed
        """
        queries = {}

        query_dir = self.settings['salesforce']['query_dir']
        for file in os.listdir(query_dir):
            if file.endswith(".soql"):
                name, ext = os.path.splitext(file)
                query_file = os.path.join(query_dir, file)
                with open(query_file, 'r') as f:
                    queries[name] = f.read().strip().replace('\n', ' ')

        # explicitly add the non-file queries
        queries['contacts'] = self.create_custom_query(table_name='Contact',
                                                       dir=query_dir)
        queries['contact_updates'] = self.create_custom_query(
            table_name='Contact', dir=query_dir, updates_only=True)
        queries['opportunity'] = self.create_custom_query(
            table_name='Opportunity', dir=query_dir)
        queries['opportunity_updates'] = self.create_custom_query(
            table_name='Opportunity', dir=query_dir, updates_only=True)

        return queries

Example #2

Show file

File: sf_tools.py Project: equinoxfitness/datacoco-sf_tools

class SFInteraction(object):
    """Simple class that interacts with Salesforce"""
    def __init__(self,
                 username,
                 password,
                 token,
                 uat,
                 project_id='datacoco3.db',
                 session_id=None,
                 instance=None,
                 version=None):
        """Instantiate a Salesforce interaction manager.

        UAT mode is explicitly set to a boolean value in case a string is provided.

        If Salesforce session credentials do not exist, attempt to retrieve.

        :param username: Username
        :param password: Password
        :param token: API token
        :param uat: Whether or not in UAT mode
        :param project_id: to identify project source api calls
        :param session_id:  Access token for existing session
        :param instance: Domain of Salesforce instance
        """
        if not username or not password or not token or uat is None:
            raise RuntimeError("%s request all __init__ arguments" % __name__)

        self.username = username
        self.password = password
        self.token = token
        self.session_id = session_id
        self.instance = instance
        self.project_id = project_id
        self.version = version

        self.conn = None
        self.bulk = None
        self.job_id = None
        self.batch_max_attempts = None
        self.batch_timeout = None
        self.batch_sleep_interval = None
        self.temp_file = "sf_temp_results.txt"

        self.redis_conn = None
        self.session_credentials_key = "sf_session_credentials"

        # Handle string uat which should be boolean
        if isinstance(uat, str) and uat.lower() in ("true", "t"):
            self.uat = True
        else:
            self.uat = False

        # Retrieve session_id and/or instance if they do not exist
        if not self.session_id or not self.instance:
            self._get_session_credentials()

    def connect(self):
        """Connect to the Salesforce API client.

        Only executes if there is not an existing open Salesforce connection.

        If there are a session_id and an instance, attempt to connect to
        the existing session. The existing session connection is verified with a Salesforce API describe call.

        If that fails, create a new connection.

        There are 3 retry attempts
        """
        if self.session_id and self.instance:
            retry_count = 1
            while True:
                if retry_count > 3:
                    LOG.l(
                        "Could not connect to Salesforce in the specified number of retries."
                    )
                    LOG.l("Starting a new connection...")
                    break
                else:
                    LOG.l(
                        f"Connecting to Salesforce: attempt {retry_count} of 3..."
                    )

                try:
                    self.conn = Salesforce(session_id=self.session_id,
                                           instance=self.instance,
                                           client_id=self.project_id,
                                           version=self.version)
                    self.conn.describe()  # Connection health check
                    return  #Sucess, leave this function
                except SalesforceError as sfe:
                    LOG.l(
                        f"Encountered error connecting to Salesforce:\n{sfe}")
                    retry_count += 1
                    sleep(5)
                    continue

        #If reconnecting didn't work or session_id is not set, then start a new connection
        try:
            self._create_new_connection()
        except:
            raise Exception("Could not initiate connection to Salesforce!")

    def fetch_soql(
        self,
        db_table,
        soql,
        batch=True,
        batch_timeout=600,
        batch_sleep_int=10,
        batch_max_attempts=1,
    ):
        """Fetch results from Salesforce soql queries.

        Batch Salesforce queries results saved to a file and retrieved because they are in CSV format
        and to avoid bulk queries timeouts.

        :param db_table: Database table name
        :param soql: Soql queries
        :param batch: Whether to use Salesforce Batch or Simple API
        :param batch_sleep_int: Salesforce Bulk query sleep interval
        :param batch_timeout: Batch job timeout in seconds
        :param batch_max_attempts: Maximum number of batch query creation attempts
        :return: If success, List of result dictionaries; Else empty list
        """
        try:
            if batch:
                # Set batch operation attributes
                self.batch_timeout = batch_timeout
                self.batch_sleep_interval = batch_sleep_int
                self.batch_max_attempts = batch_max_attempts

                results = self.get_query_records_dict(db_table, soql)

                # save to and read from file to avoid connection timeout
                self._save_results_to_file(results)
                records = self._get_results_from_file()
            else:
                result = self.conn.query(soql)

                # if there isn't a result return an empty list
                if result["records"]:
                    salesforce_records = json.loads(
                        json.dumps(result["records"][0]))
                    parsed_records = parse_sf_records(salesforce_records)
                    records = [parsed_records
                               ]  # put result in a list object for consistency
                else:
                    records = []
        except BulkApiError as e:
            self.bulk.abort_job(self.job_id)
            # TODO Handle failed bulk API transaction better
            raise e

        return records

    def get(self, object_name: str, object_id: str):
        """
        To get a dictionary with all the information regarding that record
        """
        return self.conn.__getattr__(object_name).get(object_id)

    def get_by_custom_id(self, object_name: str, field: str, id: str):
        """
        To get a dictionary with all the information regarding that record
        using a **custom** field that was defined as External ID:
        """
        return self.conn.__getattr__(object_name).get_by_custom_id(field, id)

    def upsert(self, object_name: str, field: str, id: str, data: dict):
        """
        To insert or update (upsert) a record using an external ID
        """
        return self.conn.__getattr__(object_name).upsert(f'{field}/{id}', data)

    def get_query_records_dict(self, db_table, soql_query):
        """Execute bulk Salesforce soql queries and return results as generator of dictionaries.

        :param db_table: Database table name
        :param soql_query: Soql queries
        :return: If success, List of result record dictionaries; Else empty list
        """
        self.bulk = SalesforceBulk(sessionId=self.session_id,
                                   host=self.instance)
        job = self.bulk.create_query_job(db_table, contentType="JSON")
        batch = self.bulk.query(job, soql_query)
        self.bulk.close_job(job)
        while not self.bulk.is_batch_done(batch):
            print("Waiting for batch query to complete")
            sleep(10)

        dict_records = []
        rec_count = 0
        print("Iterating through batch result set")
        for result in self.bulk.get_all_results_for_query_batch(batch):
            result = json.load(IteratorBytesIO(result))
            for row in result:
                rec_count += 1
                dict_records.append(row)
            print("Current fetched record count: ", rec_count)

        return dict_records

    def batch_query_records_dict(self,
                                 db_table,
                                 soql_query,
                                 concurrency='Serial'):
        """Execute bulk Salesforce soql queries and return results as generator of dictionaries.

        works only for PK CHUNKING enabled SF tables.

        Allows millions of record read.

        :param db_table: Database table name
        :param soql_query: Soql queries
        :return: If success, List of result record dictionaries; Else empty list
        """
        self.bulk = SalesforceBulk(sessionId=self.session_id,
                                   host=self.instance)
        job = self.bulk.create_query_job(db_table,
                                         contentType="JSON",
                                         pk_chunking=True,
                                         concurrency=concurrency)
        try:
            batch = self.bulk.query(job, soql_query)
            batch_list = self.bulk.get_batch_list(job)
            print('first batch', batch_list[0])
            batch_id = batch_list[0]['id']
            job_id = batch_list[0]['jobId']
            state = batch_list[0]['state']
            while state == 'Queued' or state == 'InProgress':
                print(
                    "Waiting for batch state Queued or InProgress to change " +
                    state)
                sleep(10)
                state = self.bulk.batch_state(batch_id, job_id)

            batch_list = self.bulk.get_batch_list(job)
            print(f'number of batches: {len(batch_list)}')
            for item in batch_list:
                print('item', item)
                batch_id = item['id']
                job_id = item['jobId']
                state = item['state']

                if state == 'NotProcessed':
                    continue

                while not self.bulk.is_batch_done(batch_id, job_id):
                    print(
                        f"Waiting for batch query to complete batch_id:{batch_id}, job_id: {job_id}, state: {state}"
                    )
                    sleep(10)
                    state = self.bulk.batch_state(batch_id, job_id)

                total_retry_count = len(batch_list)
                retry = len(batch_list)
                lastIndex = 0
                while retry > 0:
                    print(f'retry {retry} times left')
                    try:
                        for result in list(
                                self.bulk.get_all_results_for_query_batch(
                                    batch_id, job_id))[lastIndex:]:
                            result = json.load(IteratorBytesIO(result))
                            lastIndex += 1
                            yield result
                        break
                    except requests.exceptions.ChunkedEncodingError as e:
                        print('Chunking failed')
                        retry -= 1
                        self.connect()
                        self.bulk = SalesforceBulk(sessionId=self.session_id,
                                                   host=self.instance)
                        pass
                    except Exception as e:
                        print('There was an error')
                        traceback.print_exc()
                        retry -= 1
                        self.connect()
                        self.bulk = SalesforceBulk(sessionId=self.session_id,
                                                   host=self.instance)
                        pass
                if retry <= 0:
                    raise Exception(
                        f'Retried {total_retry_count} times and it still failed'
                    )
        except BulkApiError as e:
            self.bulk.abort_job(self.job_id)
            raise e

    def upload_records_to_s3(self, records, s3_bucket, s3_key, aws_access_key,
                             aws_secret_key):
        """Upload records to s3.

        :param records: Records filename
        """
        self._save_results_to_file(records)
        datetime_today = datetime.today().strftime("%Y-%m-%d-%X")

        s3_dest_key = s3_key + datetime_today

        s3_interaction = S3Interaction(aws_access_key, aws_secret_key)
        s3_interaction.put_file_to_s3(s3_bucket, s3_dest_key, self.temp_file)

        return s3_dest_key

    def get_description(self, object_name):
        """Retrieves object description

        :param object_name: Salesforce object/table name
        """
        retry = True
        while retry:
            try:
                return self.conn.__getattr__(object_name).describe()
            except SalesforceError as sfe:
                retry = self._sf_except_reconnect(sfe)

    def _sf_except_reconnect(self, e):
        """ Used in try/catch blocks to reinit the connection

        returns true if the code should be retried, false if no connection could be made
        """
        LOG.l(f"Encountered error:\n{e}")
        try:
            self.connect()
            return True
        except Exception:
            return False

    def _create_new_connection(self):
        """Create a new Salesforce API client connection.

        After the connection is created, the Salesforce session credentials are stored externally.
        """
        self.conn = Salesforce(username=self.username,
                               password=self.password,
                               security_token=self.token,
                               sandbox=self.uat,
                               client_id=self.project_id)
        self.session_id = str(self.conn.session_id)
        self.instance = str(self.conn.sf_instance)
        self._set_session_credentials()

    def _save_results_to_file(self, records):
        """Save Salesforce Bulk API results to a temp file.

        :param records: Records to save
        """
        with open(self.temp_file, "w") as f:
            for r in records:
                f.write("\n")
                f.write(str(str(r).encode("utf-8")))

    def _get_results_from_file(self):
        """Get Salesforce Bulk API results from a temp file.

        The records must be parsed. After the results are retrieved. The file is deleted.

        :return: Iterator with records.
        """
        results = []
        with open(self.temp_file, "r") as f:
            records = f.read()[1:].splitlines()
            for r in records:
                r = ast.literal_eval(r)
                results.append(r)
        os.remove(self.temp_file)
        return results

    def _get_session_credentials(self):
        """Get Salesforce session credentials stored in Redis.

        If the credentials variables do not exist, set the credentials as None.
        """
        # Establish connection to Redis
        self._connect_to_redis()
        # Get salesforce credentials if exists
        if self.redis_conn.conn.exists(self.session_credentials_key):
            self.session_id = self.redis_conn.fetch_by_key_name(
                self.session_credentials_key, "session_id")
            self.instance = self.redis_conn.fetch_by_key_name(
                self.session_credentials_key, "instance")
        else:
            self.session_id = None
            self.instance = None

    def _set_session_credentials(self):
        """Set Salesforce session credentials in Redis.

        """
        sf_session_credentials = {
            "session_id": self.session_id,
            "instance": self.instance,
        }
        self.redis_conn.set_key(self.session_credentials_key,
                                sf_session_credentials)

    def _connect_to_redis(self):
        """Connect to Redis.

        """
        CONF = config()
        host = CONF["redis"]["server"]
        port = CONF["redis"]["port"]
        db = CONF["redis"]["db"]
        self.redis_conn = RedisInteraction(host, port, db)
        self.redis_conn.connect()