Example #1
def run_spark_job(sdk, cluster_id, bucket):
    print('Running Spark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Spark job: Find total urban population in distribution by country',
            spark_job=job_pb.SparkJob(
                main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                    's3a://data-proc-public/jobs/sources/java/opencsv-4.1.jar',
                    's3a://data-proc-public/jobs/sources/java/json-20190722.jar'
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/dataproc/job/results/${{JOB_ID}}'.format(
                        bucket=bucket),
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
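The standalone examples on this page assume that an authenticated SDK object and the generated Data Proc protobuf modules are already in scope. A plausible setup is sketched below; the module paths, aliases, and authentication method are assumptions based on the yandexcloud package rather than part of the examples themselves.

# Possible setup for the standalone examples (module paths and aliases are assumptions).
import yandexcloud
from yandex.cloud.dataproc.v1 import job_pb2 as job_pb
from yandex.cloud.dataproc.v1 import job_service_pb2 as job_service_pb
from yandex.cloud.dataproc.v1 import job_service_pb2_grpc as job_service_grpc_pb

# Authenticate with an OAuth token; a service account key is another option.
sdk = yandexcloud.SDK(token='<your-oauth-token>')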
Example #2
def run_pyspark_job(sdk, cluster_id, bucket):
    print('Running Pyspark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Pyspark job',
            pyspark_job=job_pb.PysparkJob(
                main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
                python_file_uris=[
                    's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
                ],
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/jobs/results/${{JOB_ID}}'.format(
                        bucket=bucket),
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
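Examples #1 and #2 call a wait_for_operation helper that is not shown. A minimal sketch of such a helper is given below, polling the Operation service until the operation completes; the operation service module names and the timeout handling are assumptions, and Examples #6 and #7 further down use the SDK's wait_operation_and_get_result helper instead.

import time
from yandex.cloud.operation import operation_service_pb2 as operation_service_pb
from yandex.cloud.operation import operation_service_pb2_grpc as operation_service_grpc_pb


def wait_for_operation(sdk, operation, timeout=3600, poll_interval=5):
    # Poll the Operation service until the operation reports done or the timeout expires.
    operation_service = sdk.client(operation_service_grpc_pb.OperationServiceStub)
    deadline = time.time() + timeout
    while not operation.done:
        if time.time() >= deadline:
            raise TimeoutError('Operation {} did not finish in time'.format(operation.id))
        time.sleep(poll_interval)
        operation = operation_service.Get(
            operation_service_pb.GetOperationRequest(operation_id=operation.id))
    return operation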
Example #3
    def create_spark_job(
        self,
        main_jar_file_uri=None,
        main_class=None,
        file_uris=None,
        archive_uris=None,
        jar_file_uris=None,
        args=None,
        properties=None,
        cluster_id=None,
        name='Spark job',
    ):
        """
        Run a Spark job in a Yandex.Cloud Data Proc cluster.

        :param main_jar_file_uri: URI of the JAR file containing the job. Can be placed in HDFS or S3.
        :type main_jar_file_uri: str
        :param main_class: Name of the main class of the job.
        :type main_class: str
        :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
        :type file_uris: List[str]
        :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
        :type archive_uris: List[str]
        :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
        :type jar_file_uris: List[str]
        :param properties: Properties for the job.
        :type properties: Dict[str, str]
        :param args: Arguments to be passed to the job.
        :type args: List[str]
        :param cluster_id: ID of the cluster to run the job in.
                           Will try to take the ID from the Dataproc Hook object if not specified.
        :type cluster_id: str
        :param name: Name of the job. Used for labeling.
        :type name: str
        """
        cluster_id = cluster_id or self.cluster_id
        if not cluster_id:
            raise RuntimeError('Cluster id must be specified.')
        self.log.info('Running Spark job. Cluster ID: {cluster_id}'.format(
            cluster_id=cluster_id))

        request = job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name=name,
            spark_job=job_pb.SparkJob(
                main_jar_file_uri=main_jar_file_uri,
                main_class=main_class,
                file_uris=file_uris,
                archive_uris=archive_uris,
                jar_file_uris=jar_file_uris,
                args=args,
                properties=properties,
            ))
        return self.sdk.create_operation_and_get_result(
            request,
            service=job_service_grpc_pb.JobServiceStub,
            method_name='Create',
            response_type=job_pb.Job,
            meta_type=job_service_pb.CreateJobMetadata,
        )
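A hypothetical call to create_spark_job, mirroring the arguments used in Example #1, might look like the following; the dataproc variable stands for an already constructed instance of the class that defines this method, and how that instance is built is an assumption left out here.

# `dataproc` is assumed to be an instance of the class defining create_spark_job.
job_result = dataproc.create_spark_job(
    main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
    main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
    file_uris=['s3a://data-proc-public/jobs/sources/data/config.json'],
    archive_uris=['s3a://data-proc-public/jobs/sources/data/country-codes.csv.zip'],
    jar_file_uris=['s3a://data-proc-public/jobs/sources/java/opencsv-4.1.jar'],
    args=['s3a://data-proc-public/jobs/sources/data/cities500.txt.bz2'],
    properties={'spark.submit.deployMode': 'cluster'},
    name='Spark job: Find total urban population in distribution by country',
)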
Example #4
def run_mapreduce_job(sdk, cluster_id, bucket):
    print('Running Mapreduce job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Mapreduce job 1',
            mapreduce_job=job_pb.MapreduceJob(
                main_class='org.apache.hadoop.streaming.HadoopStreaming',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/mapreduce-001/mapper.py',
                    's3a://data-proc-public/jobs/sources/mapreduce-001/reducer.py'
                ],
                args=[
                    '-mapper', 'mapper.py', '-reducer', 'reducer.py',
                    '-numReduceTasks', '1', '-input',
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    '-output',
                    's3a://{bucket}/dataproc/job/results'.format(bucket=bucket)
                ],
                properties={
                    'yarn.app.mapreduce.am.resource.mb': '2048',
                    'yarn.app.mapreduce.am.command-opts': '-Xmx2048m',
                    'mapreduce.job.maps': '6',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
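Example #4 uses Hadoop Streaming, so mapper.py and reducer.py are ordinary scripts that read lines from stdin and print tab-separated key/value pairs to stdout. The real scripts in the public bucket are not reproduced on this page; the sketch below only illustrates that streaming contract with hypothetical word-count logic.

# Hypothetical illustration of the Hadoop Streaming contract; NOT the contents
# of the mapper.py/reducer.py referenced in the example above.
import sys


def streaming_mapper(stdin=sys.stdin):
    # The mapper emits one key<TAB>value pair per input token.
    for line in stdin:
        for token in line.strip().split():
            print('{}\t1'.format(token))


def streaming_reducer(stdin=sys.stdin):
    # The reducer receives pairs sorted by key and aggregates values per key.
    current_key, count = None, 0
    for line in stdin:
        key, value = line.rstrip('\n').split('\t', 1)
        if key != current_key:
            if current_key is not None:
                print('{}\t{}'.format(current_key, count))
            current_key, count = key, 0
        count += int(value)
    if current_key is not None:
        print('{}\t{}'.format(current_key, count))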
Example #5
def run_hive_job(self, cluster_id):
    print('Running Hive job {}'.format(cluster_id))
    operation = self.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Hive job 1',
            hive_job=job_pb.HiveJob(
                query_file_uri='s3a://data-proc-public/jobs/sources/hive-001/main.sql',
                script_variables={
                    'CITIES_URI': 's3a://data-proc-public/jobs/sources/hive-001/cities/',
                    'COUNTRY_CODE': 'RU',
                })))
    wait_for_operation(self, operation)
    return operation
Example #6
def run_hive_job(sdk, cluster_id):
    logging.info('Running Hive job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Hive job 1',
            hive_job=job_pb.HiveJob(
                query_file_uri='s3a://data-proc-public/jobs/sources/hive-001/main.sql',
                script_variables={
                    'CITIES_URI': 's3a://data-proc-public/jobs/sources/hive-001/cities/',
                    'COUNTRY_CODE': 'RU',
                },
            )))
    return sdk.wait_operation_and_get_result(
        operation,
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata)
Example #7
def run_spark_job(sdk, cluster_id, bucket):
    logging.info('Running Spark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Spark job: Find total urban population in distribution by country',
            spark_job=job_pb.SparkJob(
                main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                    's3a://data-proc-public/jobs/sources/java/opencsv-4.1.jar',
                    's3a://data-proc-public/jobs/sources/java/json-20190722.jar'
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/dataproc/job/results/${{JOB_ID}}'.format(
                        bucket=bucket),
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
                packages=['org.slf4j:slf4j-simple:1.7.30'],
                repositories=['https://repo1.maven.org/maven2'],
                exclude_packages=['com.amazonaws:amazon-kinesis-client'],
            )))
    return sdk.wait_operation_and_get_result(
        operation,
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata)
Example #8
    def create_hive_job(
        self,
        query=None,
        query_file_uri=None,
        script_variables=None,
        continue_on_failure=False,
        properties=None,
        cluster_id=None,
        name='Hive job',
    ):
        """
        Run a Hive job in a Yandex.Cloud Data Proc cluster.

        :param query: Hive query.
        :type query: str
        :param query_file_uri: URI of the script that contains Hive queries. Can be placed in HDFS or S3.
        :type query_file_uri: str
        :param properties: A mapping of property names to values, used to configure Hive.
        :type properties: Dict[str, str]
        :param script_variables: Mapping of query variable names to values.
        :type script_variables: Dict[str, str]
        :param continue_on_failure: Whether to continue executing queries if a query fails.
        :type continue_on_failure: bool
        :param cluster_id: ID of the cluster to run the job in.
                           Will try to take the ID from the Dataproc Hook object if not specified.
        :type cluster_id: str
        :param name: Name of the job. Used for labeling.
        :type name: str
        """
        cluster_id = cluster_id or self.cluster_id
        if not cluster_id:
            raise RuntimeError('Cluster id must be specified.')
        if (query and query_file_uri) or not (query or query_file_uri):
            raise RuntimeError(
                'Exactly one of query or query_file_uri must be specified.')
        self.log.info('Running Hive job. Cluster ID: {cluster_id}'.format(
            cluster_id=cluster_id))

        hive_job = job_pb.HiveJob(
            query_file_uri=query_file_uri,
            script_variables=script_variables,
            continue_on_failure=continue_on_failure,
            properties=properties,
        )
        if query:
            hive_job = job_pb.HiveJob(
                query_list=job_pb.QueryList(queries=query.split('\n')),
                script_variables=script_variables,
                continue_on_failure=continue_on_failure,
                properties=properties,
            )
        request = job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name=name,
            hive_job=hive_job,
        )
        return self.sdk.create_operation_and_get_result(
            request,
            service=job_service_grpc_pb.JobServiceStub,
            method_name='Create',
            response_type=job_pb.Job,
            meta_type=job_service_pb.CreateJobMetadata,
        )
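Note that an inline query string is split on newlines into a QueryList, so each newline-separated statement becomes its own query, and that query and query_file_uri are mutually exclusive. A hypothetical call is shown below; as in the earlier sketch, dataproc stands for an instance of the defining class.

# `dataproc` is assumed to be an instance of the class defining create_hive_job.
job_result = dataproc.create_hive_job(
    query='SHOW DATABASES;\nSHOW TABLES;',
    continue_on_failure=False,
    name='Hive job: inline queries example',
)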
Example #9
    def create_pyspark_job(
        self,
        main_python_file_uri=None,
        python_file_uris=None,
        file_uris=None,
        archive_uris=None,
        jar_file_uris=None,
        args=None,
        properties=None,
        cluster_id=None,
        name="Pyspark job",
        packages=None,
        repositories=None,
        exclude_packages=None,
    ):
        """
        Run a Pyspark job in a Yandex.Cloud Data Proc cluster.

        :param main_python_file_uri: URI of the Python file containing the job. Can be placed in HDFS or S3.
        :type main_python_file_uri: str
        :param python_file_uris: URIs of python files used in the job. Can be placed in HDFS or S3.
        :type python_file_uris: List[str]
        :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
        :type file_uris: List[str]
        :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
        :type archive_uris: List[str]
        :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
        :type jar_file_uris: List[str]
        :param properties: Properties for the job.
        :type properties: Dict[str, str]
        :param args: Arguments to be passed to the job.
        :type args: List[str]
        :param cluster_id: ID of the cluster to run the job in.
                           Will try to take the ID from the Dataproc Hook object if not specified.
        :type cluster_id: str
        :param name: Name of the job. Used for labeling.
        :type name: str
        :param packages: List of Maven coordinates of JARs to include on the driver and executor classpaths.
        :type packages: List[str]
        :param repositories: List of additional remote repositories to search for the Maven
            coordinates given with --packages.
        :type repositories: List[str]
        :param exclude_packages: List of groupId:artifactId entries to exclude while resolving
            the dependencies provided in --packages, to avoid dependency conflicts.
        :type exclude_packages: List[str]
        """
        cluster_id = cluster_id or self.cluster_id
        if not cluster_id:
            raise RuntimeError("Cluster id must be specified.")
        self.log.info("Running Pyspark job. Cluster ID: {cluster_id}".format(
            cluster_id=cluster_id))
        request = job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name=name,
            pyspark_job=job_pb.PysparkJob(
                main_python_file_uri=main_python_file_uri,
                python_file_uris=python_file_uris,
                file_uris=file_uris,
                archive_uris=archive_uris,
                jar_file_uris=jar_file_uris,
                args=args,
                properties=properties,
                packages=packages,
                repositories=repositories,
                exclude_packages=exclude_packages,
            ),
        )
        return self.sdk.create_operation_and_get_result(
            request,
            service=job_service_grpc_pb.JobServiceStub,
            method_name="Create",
            response_type=job_pb.Job,
            meta_type=job_service_pb.CreateJobMetadata,
        )
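A hypothetical call to create_pyspark_job, reusing the URIs and dependency settings from Examples #2 and #7; as before, dataproc stands for an instance of the class that defines the method.

# `dataproc` is assumed to be an instance of the class defining create_pyspark_job.
job_result = dataproc.create_pyspark_job(
    main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
    python_file_uris=['s3a://data-proc-public/jobs/sources/pyspark-001/geonames.py'],
    file_uris=['s3a://data-proc-public/jobs/sources/data/config.json'],
    args=['s3a://data-proc-public/jobs/sources/data/cities500.txt.bz2'],
    properties={'spark.submit.deployMode': 'cluster'},
    packages=['org.slf4j:slf4j-simple:1.7.30'],
    repositories=['https://repo1.maven.org/maven2'],
    exclude_packages=['com.amazonaws:amazon-kinesis-client'],
)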