def get_sf_object(self, sf_fields, sf_conn_id, sf_object):
    # Sign into Salesforce
    sf_conn = SalesforceHook(conn_id=sf_conn_id)
    sf_conn.sign_in()

    fields = [field['sf_name'] for field in sf_fields]
    logging.info("Making request for {0} fields from {1}".format(
        len(fields), sf_object))

    query = sf_conn.get_object_from_salesforce(sf_object, fields)
    return query
def get_sf_object_cols(self, sf_conn_id, sf_object, sf_fields):
    """
    Uses the Salesforce describe() method to fetch columns from the
    Salesforce instance. Compound columns are filtered out.
    """
    sf_conn = SalesforceHook(sf_conn_id).sign_in()
    logging.info("Signed in successfully. Fetching SFDC columns.")

    # Dynamically fetch the simple_salesforce describe method,
    # e.g. sf_conn.Lead.describe() or sf_conn.Contact.describe()
    fields = getattr(sf_conn, sf_object).describe()['fields']

    # Collect the compound field names across all fields
    compound_fields = {f['compoundFieldName'] for f in fields}
    # discard() (unlike remove()) is a no-op if None is absent
    compound_fields.discard(None)

    def build_dict(x):
        return {
            'rs_name': x['name'].lower(),
            'sf_name': x['name'],
            'path': [x['name']],
            'type': x['soapType'].split(':')[-1],
            'length': x['length'],
            'precision': x['precision']
        }

    # Loop through the fields and keep only the columns we want
    if sf_fields:
        fields = [
            build_dict(field) for field in fields
            if field['name'] in sf_fields
            and field['name'] not in compound_fields
        ]
    else:
        fields = [
            build_dict(field) for field in fields
            if field['name'] not in compound_fields
        ]

    logging.info([field['sf_name'] for field in fields])
    return fields
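# An illustrative example (values assumed, not from the source): a single
# entry of describe()['fields'] as returned by simple_salesforce carries the
# name/soapType/length/precision keys that build_dict above maps into a
# column spec, plus the compoundFieldName key used for filtering.
example_field = {
    'name': 'AnnualRevenue',
    'soapType': 'xsd:double',
    'length': 0,
    'precision': 18,
    'compoundFieldName': None,
}
# build_dict(example_field) would return:
# {'rs_name': 'annualrevenue', 'sf_name': 'AnnualRevenue',
#  'path': ['AnnualRevenue'], 'type': 'double', 'length': 0, 'precision': 18}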
def airflow_extract_data(
        cms_info,
        drive_credentials,
        start_date,
        get_member_xcom_args,
        execution_date,
        **kwargs
) -> List[str]:
    """Extracts data from Salesforce that was modified between the start
    date and the execution date.
    """
    member_id = kwargs["task_instance"].xcom_pull(**get_member_xcom_args)
    CONN_ID = cms_info["connection_id"]
    QUERIES = cms_info["queries"]
    sf_hook: SalesforceHook = SalesforceHook(conn_id=CONN_ID)
    docs_service = drive.get_google_docs_service(drive_credentials)

    logging.info("execution_date: %s", execution_date)

    start_datetime, end_datetime = dates.airflow_get_date_range(
        member_id, start_date, execution_date)
    logging.info("Calculated start_datetime: %s", start_datetime)
    logging.info("Calculated end_datetime: %s", end_datetime)

    filenames: List[str] = []
    for document_id in QUERIES:
        # Load the query and fill in the dates
        query: str = drive.load_doc_as_query(docs_service, document_id)
        template = jinja2.Template(query)
        query_with_dates: str = template.render(
            start_datetime=f"{start_datetime}Z",
            end_datetime=f"{end_datetime}Z")
        logging.info("Query with populated WHERE clause:")
        logging.info(query_with_dates)

        # Get the data and write it to a .csv
        records_dataframe: pd.DataFrame = _get_sf_records(
            sf_hook, query_with_dates)
        if not records_dataframe.empty:
            tf = tempfile.NamedTemporaryFile(delete=False)
            records_dataframe.to_csv(tf.name, index=False)
            filenames.append(tf.name)

    return filenames
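# A minimal sketch of the kind of Google-Docs-hosted SOQL template that
# airflow_extract_data above expects. The object and field names here are
# assumptions, but the Jinja variables match the ones passed to
# template.render(), and the "Z" suffix appended by the caller turns the
# rendered values into valid (unquoted) SOQL datetime literals.
import jinja2

SAMPLE_TEMPLATE = (
    "SELECT Id, LastModifiedDate FROM Contact "
    "WHERE LastModifiedDate >= {{ start_datetime }} "
    "AND LastModifiedDate < {{ end_datetime }}"
)

print(jinja2.Template(SAMPLE_TEMPLATE).render(
    start_datetime="2023-01-01T00:00:00Z",
    end_datetime="2023-01-02T00:00:00Z"))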
def execute(self, context):
    sf_conn = SalesforceHook(self.sf_conn_id).get_conn()

    logging.info(self.soql)
    query_results = sf_conn.bulk.__getattr__(self.object).query(self.soql)

    s3 = S3Hook(self.s3_conn_id)

    # One JSON object per line
    query_results = [json.dumps(result, ensure_ascii=False)
                     for result in query_results]
    query_results = '\n'.join(query_results)

    s3.load_string(query_results,
                   self.s3_key,
                   bucket_name=self.s3_bucket,
                   replace=True)
def execute(self, context):
    """
    Execute the operator.
    This will get all the data for a particular Salesforce model
    and write it to a file.
    """
    logging.info("Prepping to gather data from Salesforce")

    # Load the SalesforceHook;
    # this is what has all the logic for
    # connecting and getting data from Salesforce
    hook = SalesforceHook(conn_id=self.conn_id, output=self.output)

    # Attempt to log in to Salesforce.
    # If this process fails, it will raise an error and die right here.
    hook.sign_in()

    # Get the object from Salesforce.
    # If fields were not defined,
    # then we assume that the user wants to get all of them.
    if not self.fields:
        self.fields = hook.get_available_fields(self.object)

    logging.info("Making request for "
                 "{0} fields from {1}".format(len(self.fields), self.object))

    if self.query:
        query = self.special_query(
            self.query, hook,
            relationship_object=self.relationship_object)
    else:
        query = hook.get_object_from_salesforce(self.object, self.fields)

    # Write the records from the query to a file;
    # the list of records is stored under the "records" key.
    logging.info("Writing query results to: {0}".format(self.output))
    hook.write_object_to_file(query['records'],
                              filename=self.output,
                              fmt=self.fmt,
                              coerce_to_timestamp=self.coerce_to_timestamp,
                              record_time_added=self.record_time_added)

    logging.info("Query finished!")
def execute(self, context):
    """
    Execute the operator.
    This will get all the data for a particular Salesforce model
    and write it to a file.
    """
    logging.info("Prepping to gather data from Salesforce")

    # Open a named temporary file to hold the output until the S3 upload
    with NamedTemporaryFile("w") as tmp:
        # Load the SalesforceHook
        hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

        # Attempt to log in to Salesforce.
        # If this fails, log the failure and re-raise so the task dies here.
        try:
            hook.sign_in()
        except Exception:
            logging.error('Unable to login to Salesforce.')
            raise

        # Get the object from Salesforce.
        # If fields were not defined, all fields are pulled.
        if not self.fields:
            self.fields = hook.get_available_fields(self.object)

        logging.info("Making request for "
                     "{0} fields from {1}".format(len(self.fields),
                                                  self.object))

        if self.query:
            query = self.special_query(
                self.query, hook,
                relationship_object=self.relationship_object)
        else:
            query = hook.get_object_from_salesforce(
                self.object, self.fields)

        # Write the records from the query to the temp file;
        # the list of records is stored under the "records" key.
        logging.info("Writing query results to: {0}".format(tmp.name))
        hook.write_object_to_file(
            query['records'],
            filename=tmp.name,
            fmt=self.fmt,
            coerce_to_timestamp=self.coerce_to_timestamp,
            record_time_added=self.record_time_added)

        # Flush the temp file and upload it to S3
        tmp.flush()
        dest_s3 = S3Hook(s3_conn_id=self.s3_conn_id)
        dest_s3.load_file(filename=tmp.name,
                          key=self.output,
                          bucket_name=self.s3_bucket,
                          replace=True)
        dest_s3.connection.close()

    logging.info("Query finished!")
class TestSalesforceHook(unittest.TestCase):
    def setUp(self):
        self.salesforce_hook = SalesforceHook(conn_id='conn_id')

    def test_get_conn_exists(self):
        self.salesforce_hook.conn = Mock(spec=Salesforce)
        self.salesforce_hook.get_conn()
        self.assertIsNotNone(self.salesforce_hook.conn.return_value)

    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_connection',
        return_value=Connection(
            login='******',
            password='******',
            extra='{"security_token": "token", "sandbox": "true"}'))
    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_get_conn(self, mock_salesforce, mock_get_connection):
        self.salesforce_hook.get_conn()

        self.assertEqual(self.salesforce_hook.conn,
                         mock_salesforce.return_value)
        mock_salesforce.assert_called_once_with(
            username=mock_get_connection.return_value.login,
            password=mock_get_connection.return_value.password,
            security_token=mock_get_connection.return_value.
            extra_dejson['security_token'],
            instance_url=mock_get_connection.return_value.host,
            sandbox=mock_get_connection.return_value.extra_dejson.get(
                'sandbox', False))

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_make_query(self, mock_salesforce):
        mock_salesforce.return_value.query_all.return_value = dict(
            totalSize=123, done=True)
        self.salesforce_hook.conn = mock_salesforce.return_value
        query = 'SELECT * FROM table'

        query_results = self.salesforce_hook.make_query(
            query, include_deleted=True)

        mock_salesforce.return_value.query_all.assert_called_once_with(
            query, include_deleted=True)
        self.assertEqual(query_results,
                         mock_salesforce.return_value.query_all.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.Salesforce')
    def test_describe_object(self, mock_salesforce):
        obj = 'obj_name'
        mock_salesforce.return_value.__setattr__(obj, Mock(spec=Salesforce))
        self.salesforce_hook.conn = mock_salesforce.return_value

        obj_description = self.salesforce_hook.describe_object(obj)

        mock_salesforce.return_value.__getattr__(
            obj).describe.assert_called_once_with()
        self.assertEqual(
            obj_description,
            mock_salesforce.return_value.__getattr__(
                obj).describe.return_value)

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.get_conn')
    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
        return_value={'fields': [{'name': 'field_1'}, {'name': 'field_2'}]})
    def test_get_available_fields(self, mock_describe_object, mock_get_conn):
        obj = 'obj_name'

        available_fields = self.salesforce_hook.get_available_fields(obj)

        mock_get_conn.assert_called_once_with()
        mock_describe_object.assert_called_once_with(obj)
        self.assertEqual(available_fields, ['field_1', 'field_2'])

    @patch('airflow.contrib.hooks.salesforce_hook.SalesforceHook.make_query')
    def test_get_object_from_salesforce(self, mock_make_query):
        salesforce_objects = self.salesforce_hook.get_object_from_salesforce(
            obj='obj_name', fields=['field_1', 'field_2'])

        mock_make_query.assert_called_once_with(
            "SELECT field_1,field_2 FROM obj_name")
        self.assertEqual(salesforce_objects, mock_make_query.return_value)

    def test_write_object_to_file_invalid_format(self):
        with self.assertRaises(ValueError):
            self.salesforce_hook.write_object_to_file(query_results=[],
                                                      filename='test',
                                                      fmt="test")

    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_csv(self, mock_data_frame):
        mock_data_frame.return_value.to_csv = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[], filename=filename, fmt="csv")

        mock_data_frame.return_value.to_csv.assert_called_once_with(
            filename, index=False)
        pd.testing.assert_frame_equal(data_frame,
                                      pd.DataFrame({'test': [1, 2, 3]}))

    @patch(
        'airflow.contrib.hooks.salesforce_hook.SalesforceHook.describe_object',
        return_value={'fields': [{'name': 'field_1', 'type': 'date'}]})
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({
               'test': [1, 2, 3],
               'field_1': ['2019-01-01', '2019-01-02', '2019-01-03']
           }))
    def test_write_object_to_file_json_with_timestamp_conversion(
            self, mock_data_frame, mock_describe_object):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'
        obj_name = 'obj_name'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[{'attributes': {'type': obj_name}}],
            filename=filename,
            fmt="json",
            coerce_to_timestamp=True)

        mock_describe_object.assert_called_once_with(obj_name)
        mock_data_frame.return_value.to_json.assert_called_once_with(
            filename, "records", date_unit="s")
        pd.testing.assert_frame_equal(
            data_frame,
            pd.DataFrame({
                'test': [1, 2, 3],
                'field_1': [1.546301e+09, 1.546387e+09, 1.546474e+09]
            }))

    @patch('airflow.contrib.hooks.salesforce_hook.time.time',
           return_value=1.23)
    @patch('airflow.contrib.hooks.salesforce_hook.pd.DataFrame.from_records',
           return_value=pd.DataFrame({'test': [1, 2, 3]}))
    def test_write_object_to_file_ndjson_with_record_time(
            self, mock_data_frame, mock_time):
        mock_data_frame.return_value.to_json = Mock()
        filename = 'test'

        data_frame = self.salesforce_hook.write_object_to_file(
            query_results=[],
            filename=filename,
            fmt="ndjson",
            record_time_added=True)

        mock_data_frame.return_value.to_json.assert_called_once_with(
            filename, "records", lines=True, date_unit="s")
        pd.testing.assert_frame_equal(
            data_frame,
            pd.DataFrame({
                'test': [1, 2, 3],
                'time_fetched_from_salesforce': [
                    mock_time.return_value,
                    mock_time.return_value,
                    mock_time.return_value
                ]
            }))
def execute(self, context):
    with NamedTemporaryFile("w") as tmp:
        # Load the SalesforceHook
        hook = SalesforceHook(conn_id=self.sf_conn_id, output=tmp.name)

        # Attempt to log in to Salesforce.
        # If this fails, log the failure and re-raise so the task dies here
        # instead of continuing with an unbound connection.
        try:
            sf_conn = hook.sign_in()
        except Exception:
            logging.error('Unable to login to Salesforce.')
            raise

        logging.info(self.soql)
        logging.info(self.object)

        logging.debug('Connecting to Salesforce...')
        query_results = sf_conn.bulk.__getattr__(self.object).query(
            self.soql)
        logging.info('Retrieved results...')
        logging.info(type(query_results))
        logging.info('First line is:')
        logging.info(query_results[0])

        gcs = GoogleCloudStorageHook(self.gcs_conn_id)
        service = gcs.get_conn()

        logging.info('Preparing File...')
        intermediate_arr = []
        for i, q in enumerate(query_results):
            del q['attributes']
            q["partition_date"] = date.today().strftime('%Y-%m-%d')
            for k, v in q.items():
                # Round floats (and float-like strings) to two decimals
                if type(v) == float:
                    q[k] = round(v, 2)
                # Treat 13-digit integers as epoch milliseconds
                if (type(v) == int) and (len(str(v)) == 13):
                    q[k] = datetime.fromtimestamp(
                        v / 1000).strftime('%Y-%m-%d %H:%M:%S')
                if (type(v) == str) and (re.search(r"^(\d+\.\d+)$", v)
                                         is not None):
                    q[k] = round(float(v), 2)

            # Lowercase all keys. Building a new dict avoids mutating the
            # dict while iterating over its keys, which raises RuntimeError.
            q = {key.lower(): value for key, value in q.items()}

            intermediate_arr.append(json.dumps(q, ensure_ascii=False) + '\n')

            # Write out in batches of roughly 100 lines
            if i % 100 == 0:
                tmp.file.writelines(intermediate_arr)
                intermediate_arr = []

        tmp.file.writelines(intermediate_arr)
        # Flush so the buffered lines are on disk before the upload
        tmp.file.flush()

        logging.info('Loading results to GCS...')
        self.upload(service=service,
                    bucket=self.gcs_bucket,
                    filename=tmp.name,
                    object=self.gcs_object,
                    multipart=True,
                    num_retries=2)

    logging.info("Query finished!")
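# A standalone sketch (sample value assumed, not from the source) of the
# 13-digit epoch-milliseconds conversion used in the operator above. Note
# that datetime.fromtimestamp() interprets the value in the machine's local
# timezone, a caveat the operator inherits.
from datetime import datetime

millis = 1546300800000  # 2019-01-01T00:00:00Z expressed as epoch ms
print(datetime.fromtimestamp(millis / 1000).strftime('%Y-%m-%d %H:%M:%S'))
# on a UTC machine this prints: 2019-01-01 00:00:00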
def _get_sf_records(sf_hook: SalesforceHook, query: str) -> pd.DataFrame:
    query_result = sf_hook.make_query(query)
    records = [parse_sf_record(record) for record in query_result["records"]]
    df = pd.DataFrame.from_records(records)
    return df
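# parse_sf_record is not defined in this section. A minimal sketch, assuming
# it only strips the 'attributes' metadata dict that simple_salesforce
# attaches to every record (mirroring the `del q['attributes']` step in the
# GCS operator above):
def parse_sf_record(record: dict) -> dict:
    # Keep the actual field values, drop the Salesforce metadata
    return {k: v for k, v in record.items() if k != 'attributes'}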