Example #1
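This test is a method of a test case and relies on unittest.mock injection. A minimal sketch of the scaffolding it assumes is shown below; the patch target and the provider module path are assumptions inferred from the mock argument's name and the operator in use, not taken from the original:

    from unittest import TestCase
    from unittest.mock import MagicMock, call, patch

    from airflow.providers.yandex.operators.yandexcloud_dataproc import (
        DataprocCreatePysparkJobOperator,
    )

    class TestDataprocCreatePysparkJobOperator(TestCase):
        # Assumed patch target: the bottom-most @patch supplies
        # create_pyspark_job_mock. The original suite presumably also
        # patches the hook's credential lookup; any such extra mocks are
        # absorbed by *_ in the test signature.
        @patch('yandexcloud._wrappers.dataproc.Dataproc.create_pyspark_job')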
    def test_create_pyspark_job_operator(self, create_pyspark_job_mock, *_):
        operator = DataprocCreatePysparkJobOperator(
            task_id='create_pyspark_job',
            main_python_file_uri='s3a://some-in-bucket/jobs/sources/pyspark-001/main.py',
            python_file_uris=[
                's3a://some-in-bucket/jobs/sources/pyspark-001/geonames.py',
            ],
            file_uris=[
                's3a://some-in-bucket/jobs/sources/data/config.json',
            ],
            archive_uris=[
                's3a://some-in-bucket/jobs/sources/data/country-codes.csv.zip',
            ],
            args=[
                's3a://some-in-bucket/jobs/sources/data/cities500.txt.bz2',
                's3a://some-out-bucket/jobs/results/${{JOB_ID}}',
            ],
            jar_file_uris=[
                's3a://some-in-bucket/jobs/sources/java/dataproc-examples-1.0.jar',
                's3a://some-in-bucket/jobs/sources/java/icu4j-61.1.jar',
                's3a://some-in-bucket/jobs/sources/java/commons-lang-2.6.jar',
            ],
            properties={
                'spark.submit.deployMode': 'cluster',
            },
        )
        # The operator resolves its cluster via XCom, so stub the
        # TaskInstance in the execution context.
        context = {'task_instance': MagicMock()}
        context['task_instance'].xcom_pull.return_value = 'my_cluster_id'
        operator.execute(context)

        # execute() should have looked up both the cluster id and the
        # connection id via XCom.
        context['task_instance'].xcom_pull.assert_has_calls([
            call(key='cluster_id'),
            call(key='yandexcloud_connection_id'),
        ])

        # All job parameters are expected to be forwarded verbatim,
        # together with the cluster id pulled from XCom.
        create_pyspark_job_mock.assert_called_once_with(
            archive_uris=[
                's3a://some-in-bucket/jobs/sources/data/country-codes.csv.zip'
            ],
            args=[
                's3a://some-in-bucket/jobs/sources/data/cities500.txt.bz2',
                's3a://some-out-bucket/jobs/results/${{JOB_ID}}',
            ],
            cluster_id='my_cluster_id',
            file_uris=['s3a://some-in-bucket/jobs/sources/data/config.json'],
            jar_file_uris=[
                's3a://some-in-bucket/jobs/sources/java/dataproc-examples-1.0.jar',
                's3a://some-in-bucket/jobs/sources/java/icu4j-61.1.jar',
                's3a://some-in-bucket/jobs/sources/java/commons-lang-2.6.jar',
            ],
            main_python_file_uri='s3a://some-in-bucket/jobs/sources/pyspark-001/main.py',
            # 'Pyspark job' is the operator's default job name, since no
            # explicit name was passed to the constructor above.
            name='Pyspark job',
            properties={'spark.submit.deployMode': 'cluster'},
            python_file_uris=[
                's3a://some-in-bucket/jobs/sources/pyspark-001/geonames.py'
            ],
        )
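Note that the operator is constructed without an explicit cluster_id: when it is omitted, the operator falls back to pulling it from XCom, where a preceding cluster-creation task would have pushed it. That fallback is exactly what the assert_has_calls check above verifies.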
Example #2
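The snippet comes from inside a DAG body. A minimal sketch of the surrounding scaffolding, assuming a plain DAG definition (the bucket name, DAG id, start date, and schedule are placeholders, not from the original):

    from datetime import datetime

    from airflow import DAG
    from airflow.providers.yandex.operators.yandexcloud_dataproc import (
        DataprocCreatePysparkJobOperator,
        DataprocDeleteClusterOperator,
    )

    # Placeholder: the original defines this constant elsewhere.
    S3_BUCKET_NAME_FOR_JOB_LOGS = 'some-out-bucket'

    with DAG(
        dag_id='example_yandexcloud_dataproc',
        start_date=datetime(2021, 1, 1),
        schedule_interval=None,
    ) as dag: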
    create_pyspark_job = DataprocCreatePysparkJobOperator(
        task_id='create_pyspark_job',
        main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
        python_file_uris=[
            's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
        ],
        file_uris=[
            's3a://data-proc-public/jobs/sources/data/config.json',
        ],
        archive_uris=[
            's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
        ],
        args=[
            's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
            f's3a://{S3_BUCKET_NAME_FOR_JOB_LOGS}/dataproc/job/results/${{JOB_ID}}',
        ],
        jar_file_uris=[
            's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
            's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
            's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
        ],
        properties={
            'spark.submit.deployMode': 'cluster',
        },
        # Extra Maven coordinates to fetch for the job, the repositories
        # to resolve them from, and artifacts to exclude from resolution.
        packages=['org.slf4j:slf4j-simple:1.7.30'],
        repositories=['https://repo1.maven.org/maven2'],
        exclude_packages=['com.amazonaws:amazon-kinesis-client'],
    )

    delete_cluster = DataprocDeleteClusterOperator(task_id='delete_cluster')
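In the full example DAG these tasks are presumably chained so that the cluster is torn down only after the job finishes (the cluster-creation task is not part of this excerpt). A minimal sketch:

    create_pyspark_job >> delete_cluster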