Example 1
    def get_sf_object(self, sf_fields, sf_conn_id, sf_object):
        # Sign into Salesforce
        sf_conn = SalesforceHook(conn_id=sf_conn_id)
        sf_conn.sign_in()

        fields = [field['sf_name'] for field in sf_fields]

        logging.info("Making request for {0} fields from {1}".format(
            len(fields), sf_object))

        query = sf_conn.get_object_from_salesforce(sf_object, fields)
        return query
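
A minimal usage sketch for the method above, assuming it lives on a plugin/operator instance with a configured Salesforce connection; `operator`, 'salesforce_default', and the field specs are illustrative placeholders:

field_specs = [{'sf_name': 'Id'}, {'sf_name': 'LastModifiedDate'}]

# `operator` stands in for the instance defining get_sf_object;
# 'salesforce_default' and 'Lead' are placeholder values.
result = operator.get_sf_object(field_specs, 'salesforce_default', 'Lead')
print(len(result['records']))  # rows come back under the 'records' key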
Example 2
    def get_sf_object_cols(self, sf_conn_id, sf_object, sf_fields):
        """
        Uses Salesforce describe() method to fetch columns from
        Salesforce instance. Compound columns are filtered out.
        """
        sf_conn = SalesforceHook(sf_conn_id).sign_in()
        logging.info("Signed in successfully. Fetching SFDC columns")

        # Dynamically fetch the simple_salesforce object handle,
        # e.g. sf_conn.Lead.describe() | sf_conn.Contact.describe()
        fields = getattr(sf_conn, sf_object).describe()['fields']

        # Collect the names of compound fields (e.g. addresses) so they can
        # be filtered out; simple fields have a compoundFieldName of None.
        compound_fields = {f['compoundFieldName'] for f in fields}
        compound_fields.discard(None)

        def build_dict(x):
            return {
                'rs_name': x['name'].lower(),
                'sf_name': x['name'],
                'path': [x['name']],
                'type': x['soapType'].split(':')[-1],
                'length': x['length'],
                'precision': x['precision']
            }

        # Loop through fields and grab columns we want
        if sf_fields:
            fields = [
                build_dict(field) for field in fields
                if field['name'] in sf_fields
                if field['name'] not in compound_fields
            ]
        else:
            fields = [
                build_dict(field) for field in fields
                if field['name'] not in compound_fields
            ]

        logging.info("Selected fields: %s",
                     [field['sf_name'] for field in fields])
        return fields
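
A standalone sketch of the compound-field filtering in the method above, run against a hand-built stand-in for the describe() field metadata (the names are illustrative, not a real SFDC response):

fields = [
    {'name': 'Id', 'compoundFieldName': None},
    {'name': 'BillingStreet', 'compoundFieldName': 'BillingAddress'},
    {'name': 'BillingAddress', 'compoundFieldName': None},
]

compound_fields = {f['compoundFieldName'] for f in fields}
compound_fields.discard(None)

# The compound field itself is dropped; its component columns survive.
print([f['name'] for f in fields if f['name'] not in compound_fields])
# ['Id', 'BillingStreet']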
Example 3
def airflow_extract_data(
    cms_info,
    drive_credentials,
    start_date,
    get_member_xcom_args,
    execution_date,
    **kwargs,
) -> List[str]:
    """Extracts data from Salesforce that was modified between the start date
    and the execution date.
    """

    member_id = kwargs["task_instance"].xcom_pull(**get_member_xcom_args)

    CONN_ID = cms_info["connection_id"]
    QUERIES = cms_info["queries"]

    sf_hook: SalesforceHook = SalesforceHook(conn_id=CONN_ID)
    docs_service = drive.get_google_docs_service(drive_credentials)

    logging.info("execution_date")
    logging.info(execution_date)

    start_datetime, end_datetime = dates.airflow_get_date_range(
        member_id, start_date, execution_date)

    logging.info("Calculated start_datetime")
    logging.info(start_datetime)
    logging.info("Calculated end_datetime")
    logging.info(end_datetime)

    filenames: List[str] = []

    for document_id in QUERIES:
        # Load the query and replace the dates
        query: str = drive.load_doc_as_query(docs_service, document_id)

        template = jinja2.Template(query)
        query_with_dates: str = template.render(
            start_datetime=f"{start_datetime}Z",
            end_datetime=f"{end_datetime}Z")

        logging.info("Query with populated WHERE clause:")
        logging.info(query_with_dates)

        # Get the data and write it to a .csv
        records_dataframe: pd.DataFrame = _get_sf_records(
            sf_hook, query_with_dates)

        if not records_dataframe.empty:
            # delete=False keeps the file on disk so downstream tasks can
            # read the paths returned below.
            tf = tempfile.NamedTemporaryFile(delete=False)
            records_dataframe.to_csv(tf.name, index=False)
            filenames.append(tf.name)

    return filenames
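
A small sketch of the Jinja date substitution used above, with an inline SOQL template standing in for the Google Doc contents (the object and field names are illustrative):

import jinja2

query = (
    "SELECT Id FROM Lead "
    "WHERE LastModifiedDate >= {{ start_datetime }} "
    "AND LastModifiedDate < {{ end_datetime }}"
)
template = jinja2.Template(query)
print(template.render(start_datetime="2021-01-01T00:00:00Z",
                      end_datetime="2021-01-02T00:00:00Z"))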
Example 4
    def execute(self, context):
        sf_conn = SalesforceHook(self.sf_conn_id).get_conn()

        logging.info(self.soql)
        query_results = getattr(sf_conn.bulk, self.object).query(self.soql)

        s3 = S3Hook(self.s3_conn_id)
        # One JSON Object Per Line
        query_results = [json.dumps(result, ensure_ascii=False) for result in query_results]
        query_results = '\n'.join(query_results)

        s3.load_string(query_results, self.s3_key, bucket_name=self.s3_bucket, replace=True)
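
The one-JSON-object-per-line serialization above, sketched standalone (the sample records are made up):

import json

records = [{'Id': '001', 'Name': 'Acme'}, {'Id': '002', 'Name': 'Globex'}]
ndjson = '\n'.join(json.dumps(r, ensure_ascii=False) for r in records)
print(ndjson)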
Example 5
    def execute(self, context):
        """
        Execute the operator.
        This will get all the data for a particular Salesforce model
        and write it to a file.

        """
        logging.info("Prepping to gather data from Salesforce")

        # load the SalesforceHook
        # this is what has all the logic for
        # connecting and getting data from Salesforce
        hook = SalesforceHook(conn_id=self.conn_id, output=self.output)

        # attempt to login to Salesforce
        # if this process fails, it will raise an error and die right here;
        # we could wrap it in a try/except for softer handling if needed
        hook.sign_in()

        # get object from salesforce
        # if fields were not defined,
        # then we assume that the user wants to get all of them
        if not self.fields:
            self.fields = hook.get_available_fields(self.object)

        logging.info("Making request for"
                     "{0} fields from {1}".format(len(self.fields),
                                                  self.object))

        if self.query:
            query = self.special_query(
                self.query, hook, relationship_object=self.relationship_object)
        else:
            query = hook.get_object_from_salesforce(self.object, self.fields)

        # output the records from the query to a file
        # the list of records is stored under the "records" key
        logging.info("Writing query results to: {0}".format(self.output))
        hook.write_object_to_file(query['records'],
                                  filename=self.output,
                                  fmt=self.fmt,
                                  coerce_to_timestamp=self.coerce_to_timestamp,
                                  record_time_added=self.record_time_added)

        logging.info("Query finished!")
Example 6
    def execute(self, context):
        """
        Execute the operator.
        This will get all the data for a particular Salesforce model
        and write it to a file.
        """
        logging.info("Prepping to gather data from Salesforce")

        # Open a named temporary file to store the output until the S3 upload
        with NamedTemporaryFile("w") as tmp:

            # Load the SalesforceHook
            hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

            # Attempt to login to Salesforce.
            # If this fails, log it and re-raise so the task dies instead of
            # continuing with an unauthenticated hook.
            try:
                hook.sign_in()
            except Exception:
                logging.error('Unable to log in to Salesforce.')
                raise

            # Get object from Salesforce
            # If fields were not defined, all fields are pulled.
            if not self.fields:
                self.fields = hook.get_available_fields(self.object)

            logging.info("Making request for "
                         "{0} fields from {1}".format(len(self.fields),
                                                      self.object))

            if self.query:
                query = self.special_query(
                    self.query,
                    hook,
                    relationship_object=self.relationship_object)
            else:
                query = hook.get_object_from_salesforce(
                    self.object, self.fields)

            # output the records from the query to a file
            # the list of records is stored under the "records" key
            logging.info("Writing query results to: {0}".format(tmp.name))

            hook.write_object_to_file(
                query['records'],
                filename=tmp.name,
                fmt=self.fmt,
                coerce_to_timestamp=self.coerce_to_timestamp,
                record_time_added=self.record_time_added)

            # Flush the temp file and upload temp file to S3
            tmp.flush()

            dest_s3 = S3Hook(s3_conn_id=self.s3_conn_id)

            dest_s3.load_file(filename=tmp.name,
                              key=self.output,
                              bucket_name=self.s3_bucket,
                              replace=True)

            dest_s3.connection.close()

        logging.info("Query finished!")
    def setUp(self):
        self.salesforce_hook = SalesforceHook(conn_id='conn_id')
Example 8
class TestSalesforceHook(unittest.TestCase):

    def setUp(self):
        self.salesforce_hook = SalesforceHook(conn_id='conn_id')

    def test_get_conn_exists(self):
        self.salesforce_hook.conn = Mock(spec=Salesforce)

        self.salesforce_hook.get_conn()

        self.assertIsNotNone(self.salesforce_hook.conn.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_connection',
           return_value=Connection(
               login='******',
               password='******',
               extra='{"security_token": "token", "sandbox": "true"}'
           ))
    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_get_conn(self, mock_salesforce, mock_get_connection):
        self.salesforce_hook.get_conn()

        self.assertEqual(self.salesforce_hook.conn, mock_salesforce.return_value)
        mock_salesforce.assert_called_once_with(
            username=mock_get_connection.return_value.login,
            password=mock_get_connection.return_value.password,
            security_token=mock_get_connection.return_value.extra_dejson['security_token'],
            instance_url=mock_get_connection.return_value.host,
            sandbox=mock_get_connection.return_value.extra_dejson.get('sandbox', False)
        )

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_make_query(self, mock_salesforce):
        mock_salesforce.return_value.query_all.return_value = dict(totalSize=123, done=True)
        self.salesforce_hook.conn = mock_salesforce.return_value
        query = 'SELECT * FROM table'

        query_results = self.salesforce_hook.make_query(query)

        mock_salesforce.return_value.query_all.assert_called_once_with(query)
        self.assertEqual(query_results, mock_salesforce.return_value.query_all.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_describe_object(self, mock_salesforce):
        obj = 'obj_name'
        mock_salesforce.return_value.__setattr__(obj, Mock(spec=Salesforce))
        self.salesforce_hook.conn = mock_salesforce.return_value

        obj_description = self.salesforce_hook.describe_object(obj)

        mock_salesforce.return_value.__getattr__(obj).describe.assert_called_once_with()
        self.assertEqual(obj_description, mock_salesforce.return_value.__getattr__(obj).describe.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_conn')
    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
           return_value={'fields': [{'name': 'field_1'}, {'name': 'field_2'}]})
    def test_get_available_fields(self, mock_describe_object, mock_get_conn):
        obj = 'obj_name'

        available_fields = self.salesforce_hook.get_available_fields(obj)

        mock_get_conn.assert_called_once_with()
        mock_describe_object.assert_called_once_with(obj)
        self.assertEqual(available_fields, ['field_1', 'field_2'])

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.make_query')
    def test_get_object_from_salesforce(self, mock_make_query):
        salesforce_objects = self.salesforce_hook.get_object_from_salesforce(obj='obj_name',
                                                                             fields=['field_1', 'field_2'])

        mock_make_query.assert_called_once_with("SELECT field_1,field_2 FROM obj_name")
        self.assertEqual(salesforce_objects, mock_make_query.return_value)

    def test_write_object_to_file_invalid_format(self):
        with self.assertRaises(ValueError):
            self.salesforce_hook.write_object_to_file(query_results=[], filename='test', fmt="test")

    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_csv(self, mock_data_frame):
        mock_data_frame.return_value.to_csv = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(query_results=[], filename=filename, fmt="csv")

        mock_data_frame.return_value.to_csv.assert_called_once_with(filename, index=False)
        pd.testing.assert_frame_equal(data_frame, pd.DataFrame({'test': [1, 2, 3]}))

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
           return_value={'fields': [{'name': 'field_1', 'type': 'date'}]})
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({
               'test': [1, 2, 3],
               'field_1': ['2019-01-01', '2019-01-02', '2019-01-03']
           }))
    def test_write_object_to_file_json_with_timestamp_conversion(self, mock_data_frame, mock_describe_object):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'
        obj_name = 'obj_name'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[{'attributes': {'type': obj_name}}],
            filename=filename,
            fmt="json",
            coerce_to_timestamp=True
        )

        mock_describe_object.assert_called_once_with(obj_name)
        mock_data_frame.return_value.to_json.assert_called_once_with(filename, "records", date_unit="s")
        pd.testing.assert_frame_equal(data_frame, pd.DataFrame({
            'test': [1, 2, 3],
            'field_1': [1.546301e+09, 1.546387e+09, 1.546474e+09]
        }))

    @patch('airflow.contrib.hooks.salesforce_hook.time.time', return_value=1.23)
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_ndjson_with_record_time(self, mock_data_frame, mock_time):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[],
            filename=filename,
            fmt="ndjson",
            record_time_added=True
        )

        mock_data_frame.return_value.to_json.assert_called_once_with(
            filename,
            "records",
            lines=True,
            date_unit="s"
        )
        pd.testing.assert_frame_equal(data_frame, pd.DataFrame({
            'test': [1, 2, 3],
            'time_fetched_from_salesforce': [
                mock_time.return_value, mock_time.return_value, mock_time.return_value
            ]
        }))
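
The patched Connection objects above rely on Airflow's extra_dejson property, which parses the JSON stored in the extra field; a standalone illustration (note that the sandbox value comes back as the string 'true', not a boolean):

from airflow.models import Connection

conn = Connection(
    login='user',
    password='secret',
    extra='{"security_token": "token", "sandbox": "true"}')

print(conn.extra_dejson['security_token'])      # token
print(conn.extra_dejson.get('sandbox', False))  # 'true' (a string, not a bool)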
Example 9
    def setUp(self):
        self.salesforce_hook = SalesforceHook(conn_id='conn_id')
Example 10
class TestSalesforceHook(unittest.TestCase):
    def setUp(self):
        self.salesforce_hook = SalesforceHook(conn_id='conn_id')

    def test_get_conn_exists(self):
        self.salesforce_hook.conn = Mock(spec=Salesforce)

        self.salesforce_hook.get_conn()

        self.assertIsNotNone(self.salesforce_hook.conn.return_value)

    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_connection',
        return_value=Connection(
            login='******',
            password='******',
            extra='{"security_token": "token", "sandbox": "true"}'))
    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_get_conn(self, mock_salesforce, mock_get_connection):
        self.salesforce_hook.get_conn()

        self.assertEqual(self.salesforce_hook.conn,
                         mock_salesforce.return_value)
        mock_salesforce.assert_called_once_with(
            username=mock_get_connection.return_value.login,
            password=mock_get_connection.return_value.password,
            security_token=(
                mock_get_connection.return_value.extra_dejson['security_token']
            ),
            instance_url=mock_get_connection.return_value.host,
            sandbox=mock_get_connection.return_value.extra_dejson.get(
                'sandbox', False))

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_make_query(self, mock_salesforce):
        mock_salesforce.return_value.query_all.return_value = dict(
            totalSize=123, done=True)
        self.salesforce_hook.conn = mock_salesforce.return_value
        query = 'SELECT * FROM table'

        query_results = self.salesforce_hook.make_query(query,
                                                        include_deleted=True)

        mock_salesforce.return_value.query_all.assert_called_once_with(
            query, include_deleted=True)
        self.assertEqual(query_results,
                         mock_salesforce.return_value.query_all.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_describe_object(self, mock_salesforce):
        obj = 'obj_name'
        mock_salesforce.return_value.__setattr__(obj, Mock(spec=Salesforce))
        self.salesforce_hook.conn = mock_salesforce.return_value

        obj_description = self.salesforce_hook.describe_object(obj)

        mock_salesforce.return_value.__getattr__(
            obj).describe.assert_called_once_with()
        self.assertEqual(
            obj_description,
            mock_salesforce.return_value.__getattr__(
                obj).describe.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_conn')
    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
        return_value={'fields': [{'name': 'field_1'}, {'name': 'field_2'}]})
    def test_get_available_fields(self, mock_describe_object, mock_get_conn):
        obj = 'obj_name'

        available_fields = self.salesforce_hook.get_available_fields(obj)

        mock_get_conn.assert_called_once_with()
        mock_describe_object.assert_called_once_with(obj)
        self.assertEqual(available_fields, ['field_1', 'field_2'])

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.make_query')
    def test_get_object_from_salesforce(self, mock_make_query):
        salesforce_objects = self.salesforce_hook.get_object_from_salesforce(
            obj='obj_name', fields=['field_1', 'field_2'])

        mock_make_query.assert_called_once_with(
            "SELECT field_1,field_2 FROM obj_name")
        self.assertEqual(salesforce_objects, mock_make_query.return_value)

    def test_write_object_to_file_invalid_format(self):
        with self.assertRaises(ValueError):
            self.salesforce_hook.write_object_to_file(query_results=[],
                                                      filename='test',
                                                      fmt="test")

    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_csv(self, mock_data_frame):
        mock_data_frame.return_value.to_csv = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[], filename=filename, fmt="csv")

        mock_data_frame.return_value.to_csv.assert_called_once_with(
            filename, index=False)
        pd.testing.assert_frame_equal(data_frame,
                                      pd.DataFrame({'test': [1, 2, 3]}))

    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
        return_value={'fields': [{'name': 'field_1', 'type': 'date'}]})
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({
               'test': [1, 2, 3],
               'field_1': ['2019-01-01', '2019-01-02', '2019-01-03']
           }))
    def test_write_object_to_file_json_with_timestamp_conversion(
            self, mock_data_frame, mock_describe_object):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'
        obj_name = 'obj_name'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[{'attributes': {'type': obj_name}}],
            filename=filename,
            fmt="json",
            coerce_to_timestamp=True)

        mock_describe_object.assert_called_once_with(obj_name)
        mock_data_frame.return_value.to_json.assert_called_once_with(
            filename, "records", date_unit="s")
        pd.testing.assert_frame_equal(
            data_frame,
            pd.DataFrame({
                'test': [1, 2, 3],
                'field_1': [1.546301e+09, 1.546387e+09, 1.546474e+09]
            }))

    @patch('airflow.contrib.hooks.salesforce_hook.time.time',
           return_value=1.23)
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_ndjson_with_record_time(
            self, mock_data_frame, mock_time):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[],
            filename=filename,
            fmt="ndjson",
            record_time_added=True)

        mock_data_frame.return_value.to_json.assert_called_once_with(
            filename, "records", lines=True, date_unit="s")
        pd.testing.assert_frame_equal(
            data_frame,
            pd.DataFrame({
                'test': [1, 2, 3],
                'time_fetched_from_salesforce': [
                    mock_time.return_value, mock_time.return_value,
                    mock_time.return_value
                ]
            }))
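
The coerce_to_timestamp behavior asserted above turns date strings into epoch seconds; a minimal pandas illustration of the same transform (the hook's exact internals may differ):

import pandas as pd

df = pd.DataFrame({'field_1': ['2019-01-01', '2019-01-02', '2019-01-03']})
df['field_1'] = pd.to_datetime(df['field_1']).astype('int64') / 1e9
print(df['field_1'].tolist())
# [1546300800.0, 1546387200.0, 1546473600.0]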
Example 11
    def execute(self, context):

        with NamedTemporaryFile("w") as tmp:

            # Load the SalesforceHook
            hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

            # Attempt to login to Salesforce.
            # If this fails, re-raise so the task dies instead of continuing
            # with an undefined connection below.
            try:
                sf_conn = hook.sign_in()
            except Exception:
                logging.error('Unable to log in to Salesforce.')
                raise

            logging.info(self.soql)
            logging.info(self.object)

            logging.debug('Connecting to Salesforce...')
            query_results = getattr(sf_conn.bulk, self.object).query(self.soql)
            logging.info('Retrieved results...')

            logging.info(type(query_results))
            logging.info('First line is:')
            logging.info(query_results[0])

            gcs = GoogleCloudStorageHook(self.gcs_conn_id)
            service = gcs.get_conn()

            logging.info('Preparing File...')

            intermediate_arr = []

            for i, q in enumerate(query_results):

                del q['attributes']
                q["partition_date"] = date.today().strftime('%Y-%m-%d')

                for k, v in q.items():

                    # Round floats (and float-like strings) to two places;
                    # 13-digit ints are treated as epoch milliseconds.
                    if isinstance(v, float):
                        q[k] = round(v, 2)
                    if isinstance(v, int) and len(str(v)) == 13:
                        q[k] = datetime.fromtimestamp(
                            v / 1000).strftime('%Y-%m-%d %H:%M:%S')
                    if isinstance(v, str) and re.search(r"^(\d+\.\d+)$", v):
                        q[k] = round(float(v), 2)

                # Lowercase the keys. Copy the key list first: popping from
                # a dict while iterating its live view raises a RuntimeError.
                for key in list(q.keys()):
                    q[key.lower()] = q.pop(key)

                intermediate_arr.append(json.dumps(q, ensure_ascii=False) + '\n')

                # Write the buffered lines to the temp file every 100 records.
                if i % 100 == 0:
                    tmp.file.writelines(intermediate_arr)
                    intermediate_arr = []

            tmp.file.writelines(intermediate_arr)

            # Flush buffered writes so the upload below sees the full file.
            tmp.file.flush()

            logging.info('Loading results to GCS...')

            self.upload(service=service,
                        bucket=self.gcs_bucket,
                        filename=tmp.name,
                        object=self.gcs_object,
                        multipart=True,
                        num_retries=2)

        logging.info("Query finished!")
def _get_sf_records(sf_hook: SalesforceHook, query: str) -> pd.DataFrame:
    query_result = sf_hook.make_query(query)
    records = [parse_sf_record(record) for record in query_result["records"]]

    df = pd.DataFrame.from_records(records)
    return df
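
parse_sf_record is not shown in this example; as context, simple_salesforce returns each record with an 'attributes' metadata entry, so a hypothetical minimal parser might just strip it:

def parse_sf_record(record: dict) -> dict:
    # Hypothetical stand-in for the real helper: drop simple_salesforce's
    # 'attributes' metadata and keep the field values.
    return {k: v for k, v in record.items() if k != 'attributes'}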