def test_save_as_csv(self, mock_presto_hook, mock_gcs_hook):
    def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual(FILENAME.format(0), obj)
        self.assertEqual("text/csv", mime_type)
        self.assertFalse(gzip)
        with open(tmp_filename, "rb") as file:
            self.assertEqual(b"".join(CSV_LINES), file.read())

    mock_gcs_hook.return_value.upload.side_effect = _assert_upload

    mock_cursor = mock_presto_hook.return_value.get_conn.return_value.cursor
    mock_cursor.return_value.description = [
        ("some_num", "INTEGER", None, None, None, None, None),
        ("some_str", "VARCHAR", None, None, None, None, None),
    ]
    mock_cursor.return_value.fetchone.side_effect = [
        [42, "mock_row_content_1"],
        [43, "mock_row_content_2"],
        [44, "mock_row_content_3"],
        None,
    ]

    op = PrestoToGCSOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        export_format="csv",
        presto_conn_id=PRESTO_CONN_ID,
        gcp_conn_id=GCP_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )

    op.execute(None)

    mock_gcs_hook.return_value.upload.assert_called()
    mock_presto_hook.assert_called_once_with(presto_conn_id=PRESTO_CONN_ID)
    mock_gcs_hook.assert_called_once_with(
        delegate_to=None,
        gcp_conn_id=GCP_CONN_ID,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
def test_save_as_csv_with_file_splitting(self, mock_gcs_hook, mock_presto_hook):
    """Test that csv is split by approx_max_file_size_bytes param."""
    expected_upload = {
        FILENAME.format(0): b"".join(CSV_LINES[:3]),
        FILENAME.format(1): b"".join([CSV_LINES[0], CSV_LINES[3]]),
    }

    def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):
        self.assertEqual(BUCKET, bucket)
        self.assertEqual("text/csv", mime_type)
        self.assertFalse(gzip)
        with open(tmp_filename, "rb") as file:
            self.assertEqual(expected_upload[obj], file.read())

    mock_gcs_hook.return_value.upload.side_effect = _assert_upload

    mock_cursor = mock_presto_hook.return_value.get_conn.return_value.cursor
    mock_cursor.return_value.description = [
        ("some_num", "INTEGER", None, None, None, None, None),
        ("some_str", "VARCHAR(20)", None, None, None, None, None),
    ]
    mock_cursor.return_value.fetchone.side_effect = [
        [42, "mock_row_content_1"],
        [43, "mock_row_content_2"],
        [44, "mock_row_content_3"],
        None,
    ]

    op = PrestoToGCSOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        approx_max_file_size_bytes=len(expected_upload[FILENAME.format(0)]),
        export_format="csv",
    )

    op.execute(None)

    mock_gcs_hook.return_value.upload.assert_called()
def test_save_as_json_with_schema_file(self, mock_gcs_hook, mock_presto_hook):
    """Test writing schema files."""

    def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):  # pylint: disable=unused-argument
        if obj == SCHEMA_FILENAME:
            with open(tmp_filename, "rb") as file:
                self.assertEqual(SCHEMA_JSON, file.read())

    mock_gcs_hook.return_value.upload.side_effect = _assert_upload

    mock_cursor = mock_presto_hook.return_value.get_conn.return_value.cursor
    mock_cursor.return_value.description = [
        ("some_num", "INTEGER", None, None, None, None, None),
        ("some_str", "VARCHAR", None, None, None, None, None),
    ]
    mock_cursor.return_value.fetchone.side_effect = [
        [42, "mock_row_content_1"],
        [43, "mock_row_content_2"],
        [44, "mock_row_content_3"],
        None,
    ]

    op = PrestoToGCSOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        schema_filename=SCHEMA_FILENAME,
        export_format="csv",
        presto_conn_id=PRESTO_CONN_ID,
        gcp_conn_id=GCP_CONN_ID,
    )

    op.execute(None)

    # once for the file and once for the schema
    self.assertEqual(2, mock_gcs_hook.return_value.upload.call_count)
def test_init(self):
    """Test PrestoToGCSOperator instance is properly initialized."""
    op = PrestoToGCSOperator(
        task_id=TASK_ID,
        sql=SQL,
        bucket=BUCKET,
        filename=FILENAME,
        impersonation_chain=IMPERSONATION_CHAIN,
    )
    self.assertEqual(op.task_id, TASK_ID)
    self.assertEqual(op.sql, SQL)
    self.assertEqual(op.bucket, BUCKET)
    self.assertEqual(op.filename, FILENAME)
    self.assertEqual(op.impersonation_chain, IMPERSONATION_CHAIN)
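# Note: the tests above rely on ``mock_gcs_hook`` / ``mock_presto_hook`` being injected
# by ``mock.patch`` decorators that are not shown in this excerpt. The sketch below is
# a minimal, hypothetical illustration of that wiring; the decorator targets and the
# helper name are assumptions for illustration, not copied from the original module.
# With stacked ``mock.patch`` decorators, the innermost (bottom) decorator maps to the
# first argument after ``self``.
from unittest import mock


@mock.patch("airflow.providers.google.cloud.transfers.presto_to_gcs.PrestoHook")
@mock.patch("airflow.providers.google.cloud.transfers.sql_to_gcs.GCSHook")
def _example_patched_test(self, mock_gcs_hook, mock_presto_hook):  # hypothetical helper
    # Inside the test body, the patched classes never reach a real Presto or GCS endpoint;
    # their return values are plain MagicMock objects configured as shown above.
    ...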
dag_id="example_presto_to_gcs", schedule_interval='@once', # Override to match your needs start_date=days_ago(1), tags=["example"], ) as dag: create_dataset = BigQueryCreateEmptyDatasetOperator(task_id="create-dataset", dataset_id=DATASET_NAME) delete_dataset = BigQueryDeleteDatasetOperator( task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True ) # [START howto_operator_presto_to_gcs_basic] presto_to_gcs_basic = PrestoToGCSOperator( task_id="presto_to_gcs_basic", sql=f"select * from {SOURCE_MULTIPLE_TYPES}", bucket=GCS_BUCKET, filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json", ) # [END howto_operator_presto_to_gcs_basic] # [START howto_operator_presto_to_gcs_multiple_types] presto_to_gcs_multiple_types = PrestoToGCSOperator( task_id="presto_to_gcs_multiple_types", sql=f"select * from {SOURCE_MULTIPLE_TYPES}", bucket=GCS_BUCKET, filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json", schema_filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json", gzip=False, ) # [END howto_operator_presto_to_gcs_multiple_types]