def run_spark_job(sdk, cluster_id, bucket):
    print('Running Spark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Spark job: Find total urban population in distribution by country',
            spark_job=job_pb.SparkJob(
                main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                    's3a://data-proc-public/jobs/sources/java/opencsv-4.1.jar',
                    's3a://data-proc-public/jobs/sources/java/json-20190722.jar',
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/dataproc/job/results/${{JOB_ID}}'.format(bucket=bucket),
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
def run_pyspark_job(sdk, cluster_id, bucket):
    print('Running Pyspark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Pyspark job',
            pyspark_job=job_pb.PysparkJob(
                main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
                python_file_uris=[
                    's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
                ],
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/jobs/results/${{JOB_ID}}'.format(bucket=bucket),
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
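# The example functions above and below assume the generated Yandex.Cloud
# Data Proc bindings are imported under the aliases job_pb, job_service_pb and
# job_service_grpc_pb, and that a wait_for_operation() helper blocks until the
# job operation finishes. Below is a minimal sketch of those assumptions; the
# exact module paths and the helper are not shown in the original snippets and
# may differ in your setup.

from yandex.cloud.dataproc.v1 import job_pb2 as job_pb
from yandex.cloud.dataproc.v1 import job_service_pb2 as job_service_pb
from yandex.cloud.dataproc.v1 import job_service_pb2_grpc as job_service_grpc_pb


def wait_for_operation(sdk, operation):
    # Delegate to the SDK helper used by the later examples: it polls the
    # long-running operation until it completes and returns the Job result.
    return sdk.wait_operation_and_get_result(
        operation,
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )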
def create_spark_job(
        self,
        main_jar_file_uri=None,
        main_class=None,
        file_uris=None,
        archive_uris=None,
        jar_file_uris=None,
        args=None,
        properties=None,
        cluster_id=None,
        name='Spark job',
):
    """
    Run Spark job in Yandex.Cloud Data Proc cluster.

    :param main_jar_file_uri: URI of jar file with job. Can be placed in HDFS or S3.
    :type main_jar_file_uri: str
    :param main_class: Name of the main class of the job.
    :type main_class: str
    :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
    :type file_uris: List[str]
    :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
    :type archive_uris: List[str]
    :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
    :type jar_file_uris: List[str]
    :param properties: Properties for the job.
    :type properties: Dict[str, str]
    :param args: Arguments to be passed to the job.
    :type args: List[str]
    :param cluster_id: ID of the cluster to run the job in. Will try to take the ID
        from the Dataproc Hook object if not specified.
    :type cluster_id: str
    :param name: Name of the job. Used for labeling.
    :type name: str
    """
    cluster_id = cluster_id or self.cluster_id
    if not cluster_id:
        raise RuntimeError('Cluster id must be specified.')
    self.log.info('Running Spark job. Cluster ID: {cluster_id}'.format(cluster_id=cluster_id))

    request = job_service_pb.CreateJobRequest(
        cluster_id=cluster_id,
        name=name,
        spark_job=job_pb.SparkJob(
            main_jar_file_uri=main_jar_file_uri,
            main_class=main_class,
            file_uris=file_uris,
            archive_uris=archive_uris,
            jar_file_uris=jar_file_uris,
            args=args,
            properties=properties,
        ))
    return self.sdk.create_operation_and_get_result(
        request,
        service=job_service_grpc_pb.JobServiceStub,
        method_name='Create',
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )
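# A minimal usage sketch for create_spark_job(). The hook object and the
# wrapper function name are assumptions for illustration: hook stands for an
# already initialized Dataproc hook with .sdk, .cluster_id and .log set up.
# The URIs reuse the public example artifacts from run_spark_job() above.

def submit_population_spark_job(hook, bucket):
    return hook.create_spark_job(
        main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
        main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
        args=[
            's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
            's3a://{bucket}/dataproc/job/results'.format(bucket=bucket),
        ],
        properties={'spark.submit.deployMode': 'cluster'},
        name='Spark job: Find total urban population in distribution by country',
    )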
def run_mapreduce_job(sdk, cluster_id, bucket):
    print('Running Mapreduce job {}'.format(cluster_id))
    # Hadoop streaming job: mapper.py and reducer.py are shipped via file_uris
    # and wired together through the standard streaming arguments.
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Mapreduce job 1',
            mapreduce_job=job_pb.MapreduceJob(
                main_class='org.apache.hadoop.streaming.HadoopStreaming',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/mapreduce-001/mapper.py',
                    's3a://data-proc-public/jobs/sources/mapreduce-001/reducer.py',
                ],
                args=[
                    '-mapper', 'mapper.py',
                    '-reducer', 'reducer.py',
                    '-numReduceTasks', '1',
                    '-input', 's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    '-output', 's3a://{bucket}/dataproc/job/results'.format(bucket=bucket),
                ],
                properties={
                    'yarn.app.mapreduce.am.resource.mb': '2048',
                    'yarn.app.mapreduce.am.command-opts': '-Xmx2048m',
                    'mapreduce.job.maps': '6',
                },
            )))
    wait_for_operation(sdk, operation)
    return operation
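# The streaming job above ships mapper.py and reducer.py next to the job; the
# actual scripts behind the s3a://data-proc-public/... URIs are not reproduced
# here. The sketch below is a hypothetical, minimal Hadoop streaming pair
# (tab-separated key/value lines over stdin/stdout) to illustrate the protocol
# those scripts follow.

import sys


def streaming_mapper():
    # Emit one "key<TAB>1" record per input line, keyed on the first field.
    for line in sys.stdin:
        key = line.rstrip('\n').split('\t')[0]
        print('{}\t1'.format(key))


def streaming_reducer():
    # Sum counts per key; Hadoop streaming delivers lines grouped by key.
    current_key, total = None, 0
    for line in sys.stdin:
        key, value = line.rstrip('\n').split('\t')
        if current_key is not None and key != current_key:
            print('{}\t{}'.format(current_key, total))
            total = 0
        current_key = key
        total += int(value)
    if current_key is not None:
        print('{}\t{}'.format(current_key, total))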
def run_hive_job(self, cluster_id):
    print('Running Hive job {}'.format(cluster_id))
    operation = self.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Hive job 1',
            hive_job=job_pb.HiveJob(
                query_file_uri='s3a://data-proc-public/jobs/sources/hive-001/main.sql',
                script_variables={
                    'CITIES_URI': 's3a://data-proc-public/jobs/sources/hive-001/cities/',
                    'COUNTRY_CODE': 'RU',
                })))
    wait_for_operation(self, operation)
    return operation
def run_hive_job(sdk, cluster_id):
    logging.info('Running Hive job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Hive job 1',
            hive_job=job_pb.HiveJob(
                query_file_uri='s3a://data-proc-public/jobs/sources/hive-001/main.sql',
                script_variables={
                    'CITIES_URI': 's3a://data-proc-public/jobs/sources/hive-001/cities/',
                    'COUNTRY_CODE': 'RU',
                },
            )))
    return sdk.wait_operation_and_get_result(
        operation,
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )
def run_spark_job(sdk, cluster_id, bucket):
    logging.info('Running Spark job {}'.format(cluster_id))
    operation = sdk.client(job_service_grpc_pb.JobServiceStub).Create(
        job_service_pb.CreateJobRequest(
            cluster_id=cluster_id,
            name='Spark job: Find total urban population in distribution by country',
            spark_job=job_pb.SparkJob(
                main_jar_file_uri='s3a://data-proc-public/jobs/sources/java/dataproc-examples-1.0.jar',
                main_class='ru.yandex.cloud.dataproc.examples.PopulationSparkJob',
                file_uris=[
                    's3a://data-proc-public/jobs/sources/data/config.json',
                ],
                archive_uris=[
                    's3a://data-proc-public/jobs/sources/data/country-codes.csv.zip',
                ],
                jar_file_uris=[
                    's3a://data-proc-public/jobs/sources/java/icu4j-61.1.jar',
                    's3a://data-proc-public/jobs/sources/java/commons-lang-2.6.jar',
                    's3a://data-proc-public/jobs/sources/java/opencsv-4.1.jar',
                    's3a://data-proc-public/jobs/sources/java/json-20190722.jar',
                ],
                args=[
                    's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
                    's3a://{bucket}/dataproc/job/results/${{JOB_ID}}'.format(bucket=bucket),
                ],
                properties={
                    'spark.submit.deployMode': 'cluster',
                },
                packages=['org.slf4j:slf4j-simple:1.7.30'],
                repositories=['https://repo1.maven.org/maven2'],
                exclude_packages=['com.amazonaws:amazon-kinesis-client'],
            )))
    return sdk.wait_operation_and_get_result(
        operation,
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )
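# A sketch of how the run_*_job() helpers above might be driven end to end.
# Constructing the SDK from an OAuth token and the environment variable names
# are assumptions for illustration; cluster_id and bucket must point at an
# existing Data Proc cluster and Object Storage bucket.

import logging
import os

import yandexcloud


def main():
    logging.basicConfig(level=logging.INFO)
    sdk = yandexcloud.SDK(token=os.environ['YC_TOKEN'])  # assumed auth method
    cluster_id = os.environ['YC_DATAPROC_CLUSTER_ID']
    bucket = os.environ['YC_BUCKET']

    run_hive_job(sdk, cluster_id)
    run_spark_job(sdk, cluster_id, bucket)


if __name__ == '__main__':
    main()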
def create_hive_job(
        self,
        query=None,
        query_file_uri=None,
        script_variables=None,
        continue_on_failure=False,
        properties=None,
        cluster_id=None,
        name='Hive job',
):
    """
    Run Hive job in Yandex.Cloud Data Proc cluster.

    :param query: Hive query.
    :type query: str
    :param query_file_uri: URI of the script that contains Hive queries. Can be placed in HDFS or S3.
    :type query_file_uri: str
    :param properties: A mapping of property names to values, used to configure Hive.
    :type properties: Dict[str, str]
    :param script_variables: Mapping of query variable names to values.
    :type script_variables: Dict[str, str]
    :param continue_on_failure: Whether to continue executing queries if a query fails.
    :type continue_on_failure: bool
    :param cluster_id: ID of the cluster to run the job in. Will try to take the ID
        from the Dataproc Hook object if not specified.
    :type cluster_id: str
    :param name: Name of the job. Used for labeling.
    :type name: str
    """
    cluster_id = cluster_id or self.cluster_id
    if not cluster_id:
        raise RuntimeError('Cluster id must be specified.')
    if (query and query_file_uri) or not (query or query_file_uri):
        raise RuntimeError('Either query or query_file_uri must be specified.')
    self.log.info('Running Hive job. Cluster ID: {cluster_id}'.format(cluster_id=cluster_id))

    hive_job = job_pb.HiveJob(
        query_file_uri=query_file_uri,
        script_variables=script_variables,
        continue_on_failure=continue_on_failure,
        properties=properties,
    )
    if query:
        # An inline query is split line by line into a QueryList instead of
        # referencing a script by URI.
        hive_job = job_pb.HiveJob(
            query_list=job_pb.QueryList(queries=query.split('\n')),
            script_variables=script_variables,
            continue_on_failure=continue_on_failure,
            properties=properties,
        )
    request = job_service_pb.CreateJobRequest(
        cluster_id=cluster_id,
        name=name,
        hive_job=hive_job,
    )
    return self.sdk.create_operation_and_get_result(
        request,
        service=job_service_grpc_pb.JobServiceStub,
        method_name='Create',
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )
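# A brief usage sketch for create_hive_job(), showing the two mutually
# exclusive ways to supply queries. The hook object and the inline SQL are
# illustrative assumptions; the script URI and variables reuse the hive-001
# example from the snippets above.

def submit_hive_examples(hook):
    # Variant 1: queries read from a script in S3, with substitution variables.
    hook.create_hive_job(
        query_file_uri='s3a://data-proc-public/jobs/sources/hive-001/main.sql',
        script_variables={
            'CITIES_URI': 's3a://data-proc-public/jobs/sources/hive-001/cities/',
            'COUNTRY_CODE': 'RU',
        },
    )
    # Variant 2: inline queries, one per line (split into a QueryList above).
    hook.create_hive_job(
        query='SHOW DATABASES;\nSHOW TABLES;',
        continue_on_failure=True,
    )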
def create_pyspark_job(
        self,
        main_python_file_uri=None,
        python_file_uris=None,
        file_uris=None,
        archive_uris=None,
        jar_file_uris=None,
        args=None,
        properties=None,
        cluster_id=None,
        name="Pyspark job",
        packages=None,
        repositories=None,
        exclude_packages=None,
):
    """
    Run Pyspark job in Yandex.Cloud Data Proc cluster.

    :param main_python_file_uri: URI of python file with job. Can be placed in HDFS or S3.
    :type main_python_file_uri: str
    :param python_file_uris: URIs of python files used in the job. Can be placed in HDFS or S3.
    :type python_file_uris: List[str]
    :param file_uris: URIs of files used in the job. Can be placed in HDFS or S3.
    :type file_uris: List[str]
    :param archive_uris: URIs of archive files used in the job. Can be placed in HDFS or S3.
    :type archive_uris: List[str]
    :param jar_file_uris: URIs of JAR files used in the job. Can be placed in HDFS or S3.
    :type jar_file_uris: List[str]
    :param properties: Properties for the job.
    :type properties: Dict[str, str]
    :param args: Arguments to be passed to the job.
    :type args: List[str]
    :param cluster_id: ID of the cluster to run the job in. Will try to take the ID
        from the Dataproc Hook object if not specified.
    :type cluster_id: str
    :param name: Name of the job. Used for labeling.
    :type name: str
    :param packages: List of maven coordinates of jars to include on the driver and executor classpaths.
    :type packages: List[str]
    :param repositories: List of additional remote repositories to search for the maven
        coordinates given with --packages.
    :type repositories: List[str]
    :param exclude_packages: List of groupId:artifactId, to exclude while resolving the
        dependencies provided in --packages to avoid dependency conflicts.
    :type exclude_packages: List[str]
    """
    cluster_id = cluster_id or self.cluster_id
    if not cluster_id:
        raise RuntimeError("Cluster id must be specified.")
    self.log.info("Running Pyspark job. Cluster ID: {cluster_id}".format(cluster_id=cluster_id))

    request = job_service_pb.CreateJobRequest(
        cluster_id=cluster_id,
        name=name,
        pyspark_job=job_pb.PysparkJob(
            main_python_file_uri=main_python_file_uri,
            python_file_uris=python_file_uris,
            file_uris=file_uris,
            archive_uris=archive_uris,
            jar_file_uris=jar_file_uris,
            args=args,
            properties=properties,
            packages=packages,
            repositories=repositories,
            exclude_packages=exclude_packages,
        ),
    )
    return self.sdk.create_operation_and_get_result(
        request,
        service=job_service_grpc_pb.JobServiceStub,
        method_name="Create",
        response_type=job_pb.Job,
        meta_type=job_service_pb.CreateJobMetadata,
    )
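# A usage sketch for create_pyspark_job(), reusing the public pyspark-001
# example URIs from run_pyspark_job() and the Maven dependency settings from
# the packages-enabled run_spark_job() variant above. The hook object and the
# wrapper function name are illustrative assumptions.

def submit_pyspark_example(hook, bucket):
    return hook.create_pyspark_job(
        main_python_file_uri='s3a://data-proc-public/jobs/sources/pyspark-001/main.py',
        python_file_uris=[
            's3a://data-proc-public/jobs/sources/pyspark-001/geonames.py',
        ],
        args=[
            's3a://data-proc-public/jobs/sources/data/cities500.txt.bz2',
            's3a://{bucket}/jobs/results'.format(bucket=bucket),
        ],
        properties={'spark.submit.deployMode': 'cluster'},
        packages=['org.slf4j:slf4j-simple:1.7.30'],
        repositories=['https://repo1.maven.org/maven2'],
        exclude_packages=['com.amazonaws:amazon-kinesis-client'],
    )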