예제 #1
0
    def test_save_as_csv(self, mock_presto_hook, mock_gcs_hook):
        """Run a CSV export against mocked hooks and check the upload and hook construction."""
        result_columns = [
            ("some_num", "INTEGER", None, None, None, None, None),
            ("some_str", "VARCHAR", None, None, None, None, None),
        ]
        result_rows = [
            [42, "mock_row_content_1"],
            [43, "mock_row_content_2"],
            [44, "mock_row_content_3"],
        ]

        def _verify_upload(bucket, obj, tmp_filename, mime_type, gzip):
            # Runs in place of GCSHook.upload; inspects the temp file before it goes away.
            self.assertEqual(BUCKET, bucket)
            self.assertEqual(FILENAME.format(0), obj)
            self.assertEqual("text/csv", mime_type)
            self.assertFalse(gzip)
            with open(tmp_filename, "rb") as uploaded:
                payload = uploaded.read()
            self.assertEqual(b"".join(CSV_LINES), payload)

        gcs_upload = mock_gcs_hook.return_value.upload
        gcs_upload.side_effect = _verify_upload

        # The cursor yields the rows one at a time and then None to signal exhaustion.
        cursor = mock_presto_hook.return_value.get_conn.return_value.cursor.return_value
        cursor.description = result_columns
        cursor.fetchone.side_effect = [*result_rows, None]

        operator = PrestoToGCSOperator(
            task_id=TASK_ID,
            sql=SQL,
            bucket=BUCKET,
            filename=FILENAME,
            export_format="csv",
            presto_conn_id=PRESTO_CONN_ID,
            gcp_conn_id=GCP_CONN_ID,
            impersonation_chain=IMPERSONATION_CHAIN,
        )
        operator.execute(None)

        gcs_upload.assert_called()

        # Both hooks must have been built exactly once with the configured connections.
        mock_presto_hook.assert_called_once_with(presto_conn_id=PRESTO_CONN_ID)
        mock_gcs_hook.assert_called_once_with(
            delegate_to=None,
            gcp_conn_id=GCP_CONN_ID,
            impersonation_chain=IMPERSONATION_CHAIN,
        )
예제 #2
0
    def test_save_as_csv_with_file_splitting(self, mock_gcs_hook, mock_presto_hook):
        """Check that CSV output is split into several files once the size limit is reached."""
        # First file: header plus two rows. Second file: the header again plus the last row.
        first_chunk = b"".join(CSV_LINES[:3])
        second_chunk = b"".join([CSV_LINES[0], CSV_LINES[3]])
        expected_upload = {
            FILENAME.format(0): first_chunk,
            FILENAME.format(1): second_chunk,
        }

        def _verify_upload(bucket, obj, tmp_filename, mime_type, gzip):
            # Runs in place of GCSHook.upload; each object is compared with its expected chunk.
            self.assertEqual(BUCKET, bucket)
            self.assertEqual("text/csv", mime_type)
            self.assertFalse(gzip)
            with open(tmp_filename, "rb") as uploaded:
                payload = uploaded.read()
            self.assertEqual(expected_upload[obj], payload)

        gcs_upload = mock_gcs_hook.return_value.upload
        gcs_upload.side_effect = _verify_upload

        cursor = mock_presto_hook.return_value.get_conn.return_value.cursor.return_value
        cursor.description = [
            ("some_num", "INTEGER", None, None, None, None, None),
            ("some_str", "VARCHAR(20)", None, None, None, None, None),
        ]
        result_rows = [
            [42, "mock_row_content_1"],
            [43, "mock_row_content_2"],
            [44, "mock_row_content_3"],
        ]
        # A trailing None tells the operator that the cursor is exhausted.
        cursor.fetchone.side_effect = [*result_rows, None]

        # The size limit equals the byte length of the first expected file.
        size_limit = len(expected_upload[FILENAME.format(0)])
        operator = PrestoToGCSOperator(
            task_id=TASK_ID,
            sql=SQL,
            bucket=BUCKET,
            filename=FILENAME,
            approx_max_file_size_bytes=size_limit,
            export_format="csv",
        )
        operator.execute(None)

        gcs_upload.assert_called()
예제 #3
0
    def test_save_as_json_with_schema_file(self, mock_gcs_hook, mock_presto_hook):
        """Test that a schema file is written and uploaded alongside a JSON export."""

        # Object names seen by the upload stub, so the test can prove that the
        # schema branch below actually ran instead of being silently skipped.
        uploaded_objects = []

        def _assert_upload(bucket, obj, tmp_filename, mime_type, gzip):  # pylint: disable=unused-argument
            uploaded_objects.append(obj)
            if obj == SCHEMA_FILENAME:
                with open(tmp_filename, "rb") as file:
                    self.assertEqual(SCHEMA_JSON, file.read())

        mock_gcs_hook.return_value.upload.side_effect = _assert_upload

        mock_cursor = mock_presto_hook.return_value.get_conn.return_value.cursor

        mock_cursor.return_value.description = [
            ("some_num", "INTEGER", None, None, None, None, None),
            ("some_str", "VARCHAR", None, None, None, None, None),
        ]

        # Three data rows followed by None, which marks the end of the result set.
        mock_cursor.return_value.fetchone.side_effect = [
            [42, "mock_row_content_1"],
            [43, "mock_row_content_2"],
            [44, "mock_row_content_3"],
            None,
        ]

        op = PrestoToGCSOperator(
            task_id=TASK_ID,
            sql=SQL,
            bucket=BUCKET,
            filename=FILENAME,
            schema_filename=SCHEMA_FILENAME,
            # This test is about the JSON export; it previously requested "csv",
            # so the JSON-with-schema path was never exercised.
            export_format="json",
            presto_conn_id=PRESTO_CONN_ID,
            gcp_conn_id=GCP_CONN_ID,
        )
        op.execute(None)

        # once for the file and once for the schema
        self.assertEqual(2, mock_gcs_hook.return_value.upload.call_count)
        # Guarantees the schema content assertion inside the stub was reached.
        self.assertIn(SCHEMA_FILENAME, uploaded_objects)
예제 #4
0
 def test_init(self):
     """Check that constructor arguments are stored on the operator under the same names."""
     init_kwargs = {
         "task_id": TASK_ID,
         "sql": SQL,
         "bucket": BUCKET,
         "filename": FILENAME,
         "impersonation_chain": IMPERSONATION_CHAIN,
     }
     operator = PrestoToGCSOperator(**init_kwargs)
     # Every keyword passed in must be readable back as an attribute of the same name.
     for attribute, expected in init_kwargs.items():
         self.assertEqual(getattr(operator, attribute), expected)
예제 #5
0
    dag_id="example_presto_to_gcs",
    schedule_interval='@once',  # Override to match your needs
    start_date=days_ago(1),
    tags=["example"],
) as dag:

    create_dataset = BigQueryCreateEmptyDatasetOperator(task_id="create-dataset", dataset_id=DATASET_NAME)

    delete_dataset = BigQueryDeleteDatasetOperator(
        task_id="delete_dataset", dataset_id=DATASET_NAME, delete_contents=True
    )

    # [START howto_operator_presto_to_gcs_basic]
    presto_to_gcs_basic = PrestoToGCSOperator(
        task_id="presto_to_gcs_basic",
        sql=f"select * from {SOURCE_MULTIPLE_TYPES}",
        bucket=GCS_BUCKET,
        filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json",
    )
    # [END howto_operator_presto_to_gcs_basic]

    # [START howto_operator_presto_to_gcs_multiple_types]
    presto_to_gcs_multiple_types = PrestoToGCSOperator(
        task_id="presto_to_gcs_multiple_types",
        sql=f"select * from {SOURCE_MULTIPLE_TYPES}",
        bucket=GCS_BUCKET,
        filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}.{{}}.json",
        schema_filename=f"{safe_name(SOURCE_MULTIPLE_TYPES)}-schema.json",
        gzip=False,
    )
    # [END howto_operator_presto_to_gcs_multiple_types]