def test_construct_same_perm_dicts():
    int_perm_dict_1 = PermissiveDict(fields={'an_int': Field(int)})
    int_perm_dict_2 = PermissiveDict(fields={'an_int': Field(int)})

    # assert identical object
    assert int_perm_dict_1 is int_perm_dict_2
    # assert equivalent key
    assert int_perm_dict_1.key == int_perm_dict_2.key
def test_invalid_permissive_dict_field():
    with pytest.raises(DagsterInvalidDefinitionError) as exc_info:
        PermissiveDict({'val': Int, 'another_val': Field(Int)})

    assert str(exc_info.value) == (
        'You have passed a config type "Int" in the parameter "fields" and it is '
        'in the "val" entry of that dict. It is from a PermissiveDict with fields '
        '[\'another_val\', \'val\']. You have likely '
        'forgot to wrap this type in a Field.'
    )
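
# A minimal sketch (not part of the original suite) of the construction the error message above
# asks for: every entry in `fields` is wrapped in a Field. It assumes the same PermissiveDict,
# Field, and Int imports already used in this module, and leans on the memoization behavior
# exercised in test_construct_same_perm_dicts above.
def test_valid_permissive_dict_field_sketch():
    perm_dict = PermissiveDict({'val': Field(Int), 'another_val': Field(Int)})

    # constructing the same shape twice should yield the same memoized type object
    assert perm_dict is PermissiveDict({'val': Field(Int), 'another_val': Field(Int)})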
def _define_configurations():
    return Field(
        List[
            Dict(
                fields={
                    'Classification': Field(
                        String,
                        description='The classification within a configuration.',
                        is_optional=True,
                    ),
                    'Configurations': Field(
                        List[PermissiveDict()],
                        description='''A list of additional configurations to apply within a
                        configuration object.''',
                        is_optional=True,
                    ),
                    'Properties': Field(
                        PermissiveDict(),
                        description='''A set of properties specified within a configuration
                        classification.''',
                        is_optional=True,
                    ),
                }
            )
        ],
        description='''For Amazon EMR releases 4.0 and later. The list of configurations supplied
        for the EMR cluster you are creating.

        An optional configuration specification to be used when provisioning cluster instances,
        which can include configurations for applications and software bundled with Amazon EMR. A
        configuration consists of a classification, properties, and optional nested configurations.
        A classification refers to an application-specific configuration file. Properties are the
        settings you want to change in that file. For more information, see the EMR Configuring
        Applications guide.''',
        is_optional=True,
    )
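
# Illustrative only: a hand-written value of the shape the field above accepts -- the EMR
# 'Configurations' list. The keys come from the schema above; the concrete classification and
# property values are made-up placeholders, not recommendations.
EXAMPLE_EMR_CONFIGURATIONS = [
    {
        'Classification': 'spark-defaults',
        # 'Properties' is a PermissiveDict, so arbitrary keys pass through validation
        'Properties': {'spark.executor.memory': '2g'},
        'Configurations': [{'Classification': 'export', 'Properties': {'FOO': 'bar'}}],
    }
]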
def test_apply_default_values():
    scalar_config_type = resolve_to_config_type(String)

    assert apply_default_values(scalar_config_type, 'foo') == 'foo'
    assert apply_default_values(scalar_config_type, 3) == 3
    assert apply_default_values(scalar_config_type, {}) == {}
    assert apply_default_values(scalar_config_type, None) is None

    enum_config_type = resolve_to_config_type(
        Enum('an_enum', [EnumValue('foo'), EnumValue('bar', python_value=3)])
    )
    assert apply_default_values(enum_config_type, 'foo') == 'foo'
    assert apply_default_values(enum_config_type, 'bar') == 3

    with pytest.raises(CheckError, match='config_value should be pre-validated'):
        apply_default_values(enum_config_type, 'baz')

    with pytest.raises(CheckError, match='config_value should be pre-validated'):
        apply_default_values(enum_config_type, None)

    list_config_type = resolve_to_config_type(List[String])

    assert apply_default_values(list_config_type, ['foo']) == ['foo']
    assert apply_default_values(list_config_type, None) == []

    with pytest.raises(CheckError, match='Null list member not caught'):
        assert apply_default_values(list_config_type, [None]) == [None]

    nullable_list_config_type = resolve_to_config_type(List[Optional[String]])

    assert apply_default_values(nullable_list_config_type, ['foo']) == ['foo']
    assert apply_default_values(nullable_list_config_type, [None]) == [None]
    assert apply_default_values(nullable_list_config_type, None) == []

    composite_config_type = resolve_to_config_type(
        Dict(
            {
                'foo': Field(String),
                'bar': Field(Dict({'baz': Field(List[String])})),
                'quux': Field(String, is_optional=True, default_value='zip'),
                'quiggle': Field(String, is_optional=True),
            }
        )
    )

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(composite_config_type, {})

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(
            composite_config_type, {'bar': {'baz': ['giraffe']}, 'quux': 'nimble'}
        )

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(composite_config_type, {'foo': 'zowie', 'quux': 'nimble'})

    assert apply_default_values(
        composite_config_type, {'foo': 'zowie', 'bar': {'baz': ['giraffe']}, 'quux': 'nimble'}
    ) == {'foo': 'zowie', 'bar': {'baz': ['giraffe']}, 'quux': 'nimble'}

    assert apply_default_values(
        composite_config_type, {'foo': 'zowie', 'bar': {'baz': ['giraffe']}}
    ) == {'foo': 'zowie', 'bar': {'baz': ['giraffe']}, 'quux': 'zip'}

    assert apply_default_values(
        composite_config_type, {'foo': 'zowie', 'bar': {'baz': ['giraffe']}, 'quiggle': 'squiggle'}
    ) == {'foo': 'zowie', 'bar': {'baz': ['giraffe']}, 'quux': 'zip', 'quiggle': 'squiggle'}

    nested_composite_config_type = resolve_to_config_type(
        Dict(
            {
                'fruts': Field(
                    Dict(
                        {
                            'apple': Field(String),
                            'banana': Field(String, is_optional=True),
                            'potato': Field(String, is_optional=True, default_value='pie'),
                        }
                    )
                )
            }
        )
    )

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(nested_composite_config_type, {'fruts': None})

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(
            nested_composite_config_type, {'fruts': {'banana': 'good', 'potato': 'bad'}}
        )

    assert apply_default_values(
        nested_composite_config_type, {'fruts': {'apple': 'strawberry'}}
    ) == {'fruts': {'apple': 'strawberry', 'potato': 'pie'}}

    assert apply_default_values(
        nested_composite_config_type, {'fruts': {'apple': 'a', 'banana': 'b', 'potato': 'c'}}
    ) == {'fruts': {'apple': 'a', 'banana': 'b', 'potato': 'c'}}

    any_config_type = resolve_to_config_type(Any)

    assert apply_default_values(any_config_type, {'foo': 'bar'}) == {'foo': 'bar'}

    with pytest.raises(CheckError, match='Unsupported type'):
        assert apply_default_values(ConfigType('gargle', 'bargle', ConfigTypeKind.REGULAR), 3)

    selector_config_type = resolve_to_config_type(
        Selector(
            {
                'one': Field(String),
                'another': Field(
                    Dict({'foo': Field(String, default_value='bar', is_optional=True)})
                ),
                'yet_another': Field(String, default_value='quux', is_optional=True),
            }
        )
    )

    with pytest.raises(CheckError):
        apply_default_values(selector_config_type, 'one')

    with pytest.raises(ParameterCheckError):
        apply_default_values(selector_config_type, None)

    with pytest.raises(ParameterCheckError, match='Expected dict with single item'):
        apply_default_values(selector_config_type, {})

    with pytest.raises(CheckError):
        apply_default_values(selector_config_type, {'one': 'foo', 'another': 'bar'})

    assert apply_default_values(selector_config_type, {'one': 'foo'}) == {'one': 'foo'}
    assert apply_default_values(selector_config_type, {'one': None}) == {'one': None}
    assert apply_default_values(selector_config_type, {'one': {}}) == {'one': {}}
    assert apply_default_values(selector_config_type, {'another': {}}) == {
        'another': {'foo': 'bar'}
    }

    singleton_selector_config_type = resolve_to_config_type(
        Selector({'foo': Field(String, default_value='bar', is_optional=True)})
    )

    assert apply_default_values(singleton_selector_config_type, None) == {'foo': 'bar'}

    permissive_dict_config_type = resolve_to_config_type(
        PermissiveDict(
            {
                'foo': Field(String),
                'bar': Field(String, default_value='baz', is_optional=True),
            }
        )
    )

    with pytest.raises(CheckError, match='Missing non-optional composite member'):
        apply_default_values(permissive_dict_config_type, None)

    assert apply_default_values(permissive_dict_config_type, {'foo': 'wow', 'mau': 'mau'}) == {
        'foo': 'wow',
        'bar': 'baz',
        'mau': 'mau',
    }
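
# A small additional sketch, not part of the original test: the same default machinery applied to
# a list of composites. Whether apply_default_values fills defaults inside list members exactly
# like this is an assumption here, inferred from the list and composite cases exercised above.
def test_apply_default_values_list_of_dicts_sketch():
    list_of_dicts_type = resolve_to_config_type(
        List[
            Dict({'name': Field(String), 'nick': Field(String, is_optional=True, default_value='n/a')})
        ]
    )
    assert apply_default_values(
        list_of_dicts_type, [{'name': 'a'}, {'name': 'b', 'nick': 'bee'}]
    ) == [{'name': 'a', 'nick': 'n/a'}, {'name': 'b', 'nick': 'bee'}]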
def define_dataproc_job_config(): return Field( Dict( fields={ 'pysparkJob': Field( Dict( fields={ 'mainPythonFileUri': Field( String, description= '''Required. The HCFS URI of the main Python file to use as the driver. Must be a .py file.''', is_optional=True, ), 'archiveUris': Field( List[String], description= '''Optional. HCFS URIs of archives to be extracted in the working directory of .jar, .tar, .tar.gz, .tgz, and .zip.''', is_optional=True, ), 'jarFileUris': Field( List[String], description= '''Optional. HCFS URIs of jar files to add to the CLASSPATHs of the Python driver and tasks.''', is_optional=True, ), 'loggingConfig': Field( Dict( fields={ 'driverLogLevels': Field( PermissiveDict(), description= '''The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: \'com.google = FATAL\', \'root = INFO\', \'org.apache = DEBUG\'''', is_optional=True, ) }), description= '''The runtime logging config of the job.''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. A mapping of property names to values, used to configure PySpark. Properties that conflict with values set by the Cloud Dataproc API may be overwritten. Can include properties set in /etc/spark/conf/spark-defaults.conf and classes in user code.''', is_optional=True, ), 'args': Field( List[String], description= '''Optional. The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.''', is_optional=True, ), 'fileUris': Field( List[String], description= '''Optional. HCFS URIs of files to be copied to the working directory of Python drivers and distributed tasks. Useful for naively parallel tasks.''', is_optional=True, ), 'pythonFileUris': Field( List[String], description= '''Optional. HCFS file URIs of Python files to pass to the PySpark framework. Supported file types: .py, .egg, and .zip.''', is_optional=True, ), }), description= '''A Cloud Dataproc job for running Apache PySpark (https://spark.apache.org/docs/0.9.0/python-programming-guide.html) applications on YARN.''', is_optional=True, ), 'reference': Field( Dict( fields={ 'projectId': Field( String, description= '''Required. The ID of the Google Cloud Platform project that the job belongs to.''', is_optional=True, ), 'jobId': Field( String, description= '''Optional. The job ID, which must be unique within the project.The ID must contain only letters (a-z, A-Z), numbers (0-9), underscores (_), or hyphens (-). The maximum length is 100 characters.If not specified by the caller, the job ID will be provided by the server.''', is_optional=True, default_value='dagster-job-' + str(uuid.uuid4()), ), }), description= '''Encapsulates the full scoping used to reference a job.''', is_optional=True, ), 'hadoopJob': Field( Dict( fields={ 'jarFileUris': Field( List[String], description= '''Optional. Jar file URIs to add to the CLASSPATHs of the Hadoop driver and tasks.''', is_optional=True, ), 'loggingConfig': Field( Dict( fields={ 'driverLogLevels': Field( PermissiveDict(), description= '''The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: \'com.google = FATAL\', \'root = INFO\', \'org.apache = DEBUG\'''', is_optional=True, ) }), description= '''The runtime logging config of the job.''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. 
A mapping of property names to values, used to configure Hadoop. Properties that conflict with values set by the Cloud Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site and classes in user code.''', is_optional=True, ), 'args': Field( List[String], description= '''Optional. The arguments to pass to the driver. Do not include arguments, such as -libjars or -Dfoo=bar, that can be set as job properties, since a collision may occur that causes an incorrect job submission.''', is_optional=True, ), 'fileUris': Field( List[String], description= '''Optional. HCFS (Hadoop Compatible Filesystem) URIs of files to be copied to the working directory of Hadoop drivers and distributed tasks. Useful for naively parallel tasks.''', is_optional=True, ), 'mainClass': Field( String, description= '''The name of the driver\'s main class. The jar file containing the class must be in the default CLASSPATH or specified in jar_file_uris.''', is_optional=True, ), 'archiveUris': Field( List[String], description= '''Optional. HCFS URIs of archives to be extracted in the working directory of Hadoop drivers and tasks. Supported file types: .jar, .tar, .tar.gz, .tgz, or .zip.''', is_optional=True, ), 'mainJarFileUri': Field( String, description= '''The HCFS URI of the jar file containing the main class. Examples: \'gs://foo-bucket/analytics-binaries/extract-useful-metrics-mr.jar\' \'hdfs:/tmp/test-samples/custom-wordcount.jar\' \'file:///home/usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar\'''', is_optional=True, ), }), description= '''A Cloud Dataproc job for running Apache Hadoop MapReduce (https://hadoop.apache.org/docs/current/hadoop-mapreduce-client/hadoop-mapreduce-client-core/MapReduceTutorial.html) jobs on Apache Hadoop YARN (https://hadoop.apache.org/docs/r2.7.1/hadoop-yarn/hadoop-yarn-site/YARN.html).''', is_optional=True, ), 'status': Field(Dict(fields={}), description='''Cloud Dataproc job status.''', is_optional=True), 'placement': Field( Dict( fields={ 'clusterName': Field( String, description= '''Required. The name of the cluster where the job will be submitted.''', is_optional=True, ) }), description='''Cloud Dataproc job config.''', is_optional=True, ), 'scheduling': Field( Dict( fields={ 'maxFailuresPerHour': Field( Int, description= '''Optional. Maximum number of times per hour a driver may be restarted as a result of driver terminating with non-zero code before job is reported failed.A job may be reported as thrashing if driver exits with non-zero code 4 times within 10 minute window.Maximum value is 10.''', is_optional=True, ) }), description='''Job scheduling options.''', is_optional=True, ), 'pigJob': Field( Dict( fields={ 'queryFileUri': Field( String, description= '''The HCFS URI of the script that contains the Pig queries.''', is_optional=True, ), 'queryList': Field( Dict( fields={ 'queries': Field( List[String], description= '''Required. The queries to execute. You do not need to terminate a query with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of an Cloud Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4", ] } } ''', is_optional=True, ) }), description= '''A list of queries to run on a cluster.''', is_optional=True, ), 'jarFileUris': Field( List[String], description= '''Optional. HCFS URIs of jar files to add to the CLASSPATH of the Pig Client and Hadoop MapReduce (MR) tasks. 
Can contain Pig UDFs.''', is_optional=True, ), 'scriptVariables': Field( PermissiveDict(), description= '''Optional. Mapping of query variable names to values (equivalent to the Pig command: name=[value]).''', is_optional=True, ), 'loggingConfig': Field( Dict( fields={ 'driverLogLevels': Field( PermissiveDict(), description= '''The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: \'com.google = FATAL\', \'root = INFO\', \'org.apache = DEBUG\'''', is_optional=True, ) }), description= '''The runtime logging config of the job.''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. A mapping of property names to values, used to configure Pig. Properties that conflict with values set by the Cloud Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml, /etc/pig/conf/pig.properties, and classes in user code.''', is_optional=True, ), 'continueOnFailure': Field( Bool, description= '''Optional. Whether to continue executing queries if a query fails. The default value is false. Setting to true can be useful when executing independent parallel queries.''', is_optional=True, ), }), description='''A Cloud Dataproc job for running Apache Pig (https://pig.apache.org/) queries on YARN.''', is_optional=True, ), 'hiveJob': Field( Dict( fields={ 'queryFileUri': Field( String, description= '''The HCFS URI of the script that contains Hive queries.''', is_optional=True, ), 'queryList': Field( Dict( fields={ 'queries': Field( List[String], description= '''Required. The queries to execute. You do not need to terminate a query with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of an Cloud Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4", ] } } ''', is_optional=True, ) }), description= '''A list of queries to run on a cluster.''', is_optional=True, ), 'jarFileUris': Field( List[String], description= '''Optional. HCFS URIs of jar files to add to the CLASSPATH of the Hive server and Hadoop MapReduce (MR) tasks. Can contain Hive SerializationStrategys and UDFs.''', is_optional=True, ), 'scriptVariables': Field( PermissiveDict(), description= '''Optional. Mapping of query variable names to values (equivalent to the Hive command: SET name="value";).''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. A mapping of property names and values, used to configure Hive. Properties that conflict with values set by the Cloud Dataproc API may be overwritten. Can include properties set in /etc/hadoop/conf/*-site.xml, /etc/hive/conf/hive-site.xml, and classes in user code.''', is_optional=True, ), 'continueOnFailure': Field( Bool, description= '''Optional. Whether to continue executing queries if a query fails. The default value is false. Setting to true can be useful when executing independent parallel queries.''', is_optional=True, ), }), description='''A Cloud Dataproc job for running Apache Hive (https://hive.apache.org/) queries on YARN.''', is_optional=True, ), 'labels': Field( PermissiveDict(), description= '''Optional. The labels to associate with this job. Label keys must contain 1 to 63 characters, and must conform to RFC 1035 (https://www.ietf.org/rfc/rfc1035.txt). 
Label values may be empty, but, if present, must contain 1 to 63 characters, and must conform to RFC 1035 (https://www.ietf.org/rfc/rfc1035.txt). No more than 32 labels can be associated with a job.''', is_optional=True, ), 'sparkSqlJob': Field( Dict( fields={ 'queryFileUri': Field( String, description= '''The HCFS URI of the script that contains SQL queries.''', is_optional=True, ), 'queryList': Field( Dict( fields={ 'queries': Field( List[String], description= '''Required. The queries to execute. You do not need to terminate a query with a semicolon. Multiple queries can be specified in one string by separating each with a semicolon. Here is an example of an Cloud Dataproc API snippet that uses a QueryList to specify a HiveJob: "hiveJob": { "queryList": { "queries": [ "query1", "query2", "query3;query4", ] } } ''', is_optional=True, ) }), description= '''A list of queries to run on a cluster.''', is_optional=True, ), 'scriptVariables': Field( PermissiveDict(), description= '''Optional. Mapping of query variable names to values (equivalent to the Spark SQL command: SET name="value";).''', is_optional=True, ), 'jarFileUris': Field( List[String], description= '''Optional. HCFS URIs of jar files to be added to the Spark CLASSPATH.''', is_optional=True, ), 'loggingConfig': Field( Dict( fields={ 'driverLogLevels': Field( PermissiveDict(), description= '''The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: \'com.google = FATAL\', \'root = INFO\', \'org.apache = DEBUG\'''', is_optional=True, ) }), description= '''The runtime logging config of the job.''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. A mapping of property names to values, used to configure Spark SQL\'s SparkConf. Properties that conflict with values set by the Cloud Dataproc API may be overwritten.''', is_optional=True, ), }), description= '''A Cloud Dataproc job for running Apache Spark SQL (http://spark.apache.org/sql/) queries.''', is_optional=True, ), 'sparkJob': Field( Dict( fields={ 'mainJarFileUri': Field( String, description= '''The HCFS URI of the jar file that contains the main class.''', is_optional=True, ), 'jarFileUris': Field( List[String], description= '''Optional. HCFS URIs of jar files to add to the CLASSPATHs of the Spark driver and tasks.''', is_optional=True, ), 'loggingConfig': Field( Dict( fields={ 'driverLogLevels': Field( PermissiveDict(), description= '''The per-package log levels for the driver. This may include "root" package name to configure rootLogger. Examples: \'com.google = FATAL\', \'root = INFO\', \'org.apache = DEBUG\'''', is_optional=True, ) }), description= '''The runtime logging config of the job.''', is_optional=True, ), 'properties': Field( PermissiveDict(), description= '''Optional. A mapping of property names to values, used to configure Spark. Properties that conflict with values set by the Cloud Dataproc API may be overwritten. Can include properties set in /etc/spark/conf/spark-defaults.conf and classes in user code.''', is_optional=True, ), 'args': Field( List[String], description= '''Optional. The arguments to pass to the driver. Do not include arguments, such as --conf, that can be set as job properties, since a collision may occur that causes an incorrect job submission.''', is_optional=True, ), 'fileUris': Field( List[String], description= '''Optional. HCFS URIs of files to be copied to the working directory of Spark drivers and distributed tasks. 
Useful for naively parallel tasks.''', is_optional=True, ), 'mainClass': Field( String, description= '''The name of the driver\'s main class. The jar file that contains the class must be in the default CLASSPATH or specified in jar_file_uris.''', is_optional=True, ), 'archiveUris': Field( List[String], description= '''Optional. HCFS URIs of archives to be extracted in the working directory of Spark drivers and tasks. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip.''', is_optional=True, ), }), description='''A Cloud Dataproc job for running Apache Spark (http://spark.apache.org/) applications on YARN.''', is_optional=True, ), }), description='''A Cloud Dataproc job resource.''', is_optional=True, )
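
# Illustrative only: a config value of the rough shape define_dataproc_job_config() accepts,
# exercising the reference, placement, pysparkJob, and labels sections defined above. The URIs
# and property values are placeholders; 'properties', 'driverLogLevels', and 'labels' are
# PermissiveDicts, so arbitrary keys pass through.
EXAMPLE_DATAPROC_JOB_CONFIG = {
    'reference': {'projectId': 'my-project'},
    'placement': {'clusterName': 'my-cluster'},
    'pysparkJob': {
        'mainPythonFileUri': 'gs://my-bucket/main.py',
        'args': ['--date', '2019-01-01'],
        'properties': {'spark.executor.memory': '2g'},
        'loggingConfig': {'driverLogLevels': {'root': 'INFO'}},
    },
    'labels': {'team': 'data'},
}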
        }),
        is_optional=True,
        default_value={'en': {'whom': 'world'}},
    )
)
def hello_world_default(context) -> str:
    if 'haw' in context.solid_config:
        return 'Aloha {whom}!'.format(whom=context.solid_config['haw']['whom'])

    if 'cn' in context.solid_config:
        return '你好,{whom}!'.format(whom=context.solid_config['cn']['whom'])

    if 'en' in context.solid_config:
        return 'Hello, {whom}!'.format(whom=context.solid_config['en']['whom'])


@solid(config=Field(PermissiveDict({'required': Field(String)})))
def partially_specified_config(context) -> List:
    return sorted(list(context.solid_config.items()))


def test_any_config():
    res = execute_solid(
        any_config, environment_dict={'solids': {'any_config': {'config': 'foo'}}}
    )
    assert res.output_value() == 'foo'

    res = execute_solid(any_config,
def _multiple_required_fields_config_permissive_dict():
    return Field(
        PermissiveDict(
            {
                'field_one': Field(String),
                'field_two': Field(String),
            }
        )
    )
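
# A hedged sketch (not in the original module) of how the permissive schema above behaves when
# used as solid config: the two declared fields are required, and any extra keys supplied in the
# environment dict are passed through untouched. The solid name is hypothetical, and it assumes
# dagster's @solid decorator is in scope here as it is elsewhere in these tests.
@solid(
    name='solid_with_permissive_config_sketch',
    config=_multiple_required_fields_config_permissive_dict(),
)
def solid_with_permissive_config_sketch(context):
    # e.g. config {'field_one': 'x', 'field_two': 'y', 'extra': 'z'} would surface all three keys
    return sorted(context.solid_config.items())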
def define_dataproc_cluster_config(): return Field( Dict( fields={ 'masterConfig': Field( Dict( fields={ 'accelerators': Field( List[ Dict( fields={ 'acceleratorCount': Field( Int, description='''The number of the accelerator cards of this type exposed to this instance.''', is_optional=True, ), 'acceleratorTypeUri': Field( String, description='''Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See Compute Engine AcceleratorTypes.Examples: https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 nvidia-tesla-k80Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the accelerator type resource, for example, nvidia-tesla-k80.''', is_optional=True, ), } ) ], description='''Optional. The Compute Engine accelerator configuration for these instances.Beta Feature: This feature is still under development. It may be changed before final release.''', is_optional=True, ), 'numInstances': Field( Int, description='''Optional. The number of VM instances in the instance group. For master instance groups, must be set to 1.''', is_optional=True, ), 'diskConfig': Field( Dict( fields={ 'numLocalSsds': Field( Int, description='''Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and HDFS (https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.''', is_optional=True, ), 'bootDiskSizeGb': Field( Int, description='''Optional. Size in GB of the boot disk (default is 500GB).''', is_optional=True, ), 'bootDiskType': Field( String, description='''Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive).''', is_optional=True, ), } ), description='''Specifies the config of disk options for a group of VM instances.''', is_optional=True, ), 'managedGroupConfig': Field( Dict(fields={}), description='''Specifies the resources used to actively manage an instance group.''', is_optional=True, ), 'isPreemptible': Field( Bool, description='''Optional. Specifies that this instance group contains preemptible instances.''', is_optional=True, ), 'imageUri': Field( String, description='''Optional. The Compute Engine image resource used for cluster instances. It can be specified or may be inferred from SoftwareConfig.image_version.''', is_optional=True, ), 'machineTypeUri': Field( String, description='''Optional. The Compute Engine machine type used for cluster instances.A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 n1-standard-2Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the machine type resource, for example, n1-standard-2.''', is_optional=True, ), } ), description='''Optional. 
The config settings for Compute Engine resources in an instance group, such as a master or worker group.''', is_optional=True, ), 'secondaryWorkerConfig': Field( Dict( fields={ 'accelerators': Field( List[ Dict( fields={ 'acceleratorCount': Field( Int, description='''The number of the accelerator cards of this type exposed to this instance.''', is_optional=True, ), 'acceleratorTypeUri': Field( String, description='''Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See Compute Engine AcceleratorTypes.Examples: https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 nvidia-tesla-k80Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the accelerator type resource, for example, nvidia-tesla-k80.''', is_optional=True, ), } ) ], description='''Optional. The Compute Engine accelerator configuration for these instances.Beta Feature: This feature is still under development. It may be changed before final release.''', is_optional=True, ), 'numInstances': Field( Int, description='''Optional. The number of VM instances in the instance group. For master instance groups, must be set to 1.''', is_optional=True, ), 'diskConfig': Field( Dict( fields={ 'numLocalSsds': Field( Int, description='''Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and HDFS (https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.''', is_optional=True, ), 'bootDiskSizeGb': Field( Int, description='''Optional. Size in GB of the boot disk (default is 500GB).''', is_optional=True, ), 'bootDiskType': Field( String, description='''Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive).''', is_optional=True, ), } ), description='''Specifies the config of disk options for a group of VM instances.''', is_optional=True, ), 'managedGroupConfig': Field( Dict(fields={}), description='''Specifies the resources used to actively manage an instance group.''', is_optional=True, ), 'isPreemptible': Field( Bool, description='''Optional. Specifies that this instance group contains preemptible instances.''', is_optional=True, ), 'imageUri': Field( String, description='''Optional. The Compute Engine image resource used for cluster instances. It can be specified or may be inferred from SoftwareConfig.image_version.''', is_optional=True, ), 'machineTypeUri': Field( String, description='''Optional. The Compute Engine machine type used for cluster instances.A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 n1-standard-2Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the machine type resource, for example, n1-standard-2.''', is_optional=True, ), } ), description='''Optional. 
The config settings for Compute Engine resources in an instance group, such as a master or worker group.''', is_optional=True, ), 'encryptionConfig': Field( Dict( fields={ 'gcePdKmsKeyName': Field( String, description='''Optional. The Cloud KMS key name to use for PD disk encryption for all instances in the cluster.''', is_optional=True, ) } ), description='''Encryption settings for the cluster.''', is_optional=True, ), 'securityConfig': Field( Dict( fields={ 'kerberosConfig': Field( Dict( fields={ 'truststorePasswordUri': Field( String, description='''Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided truststore. For the self-signed certificate, this password is generated by Dataproc.''', is_optional=True, ), 'enableKerberos': Field( Bool, description='''Optional. Flag to indicate whether to Kerberize the cluster.''', is_optional=True, ), 'truststoreUri': Field( String, description='''Optional. The Cloud Storage URI of the truststore file used for SSL encryption. If not provided, Dataproc will provide a self-signed certificate.''', is_optional=True, ), 'crossRealmTrustRealm': Field( String, description='''Optional. The remote realm the Dataproc on-cluster KDC will trust, should the user enable cross realm trust.''', is_optional=True, ), 'rootPrincipalPasswordUri': Field( String, description='''Required. The Cloud Storage URI of a KMS encrypted file containing the root principal password.''', is_optional=True, ), 'kmsKeyUri': Field( String, description='''Required. The uri of the KMS key used to encrypt various sensitive files.''', is_optional=True, ), 'crossRealmTrustKdc': Field( String, description='''Optional. The KDC (IP or hostname) for the remote trusted realm in a cross realm trust relationship.''', is_optional=True, ), 'crossRealmTrustSharedPasswordUri': Field( String, description='''Optional. The Cloud Storage URI of a KMS encrypted file containing the shared password between the on-cluster Kerberos realm and the remote trusted realm, in a cross realm trust relationship.''', is_optional=True, ), 'tgtLifetimeHours': Field( Int, description='''Optional. The lifetime of the ticket granting ticket, in hours. If not specified, or user specifies 0, then default value 10 will be used.''', is_optional=True, ), 'keystoreUri': Field( String, description='''Optional. The Cloud Storage URI of the keystore file used for SSL encryption. If not provided, Dataproc will provide a self-signed certificate.''', is_optional=True, ), 'keyPasswordUri': Field( String, description='''Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided key. For the self-signed certificate, this password is generated by Dataproc.''', is_optional=True, ), 'keystorePasswordUri': Field( String, description='''Optional. The Cloud Storage URI of a KMS encrypted file containing the password to the user provided keystore. For the self-signed certificate, this password is generated by Dataproc.''', is_optional=True, ), 'crossRealmTrustAdminServer': Field( String, description='''Optional. The admin server (IP or hostname) for the remote trusted realm in a cross realm trust relationship.''', is_optional=True, ), 'kdcDbKeyUri': Field( String, description='''Optional. 
The Cloud Storage URI of a KMS encrypted file containing the master key of the KDC database.''', is_optional=True, ), } ), description='''Specifies Kerberos related configuration.''', is_optional=True, ) } ), description='''Security related configuration, including Kerberos.''', is_optional=True, ), 'initializationActions': Field( List[ Dict( fields={ 'executionTimeout': Field( String, description='''Optional. Amount of time executable has to complete. Default is 10 minutes. Cluster creation fails with an explanatory error message (the name of the executable that caused the error and the exceeded timeout period) if the executable is not completed at end of the timeout period.''', is_optional=True, ), 'executableFile': Field( String, description='''Required. Cloud Storage URI of executable file.''', is_optional=True, ), } ) ], description='''Optional. Commands to execute on each node after config is completed. By default, executables are run on master and all worker nodes. You can test a node\'s role metadata to run an executable on a master or worker node, as shown below using curl (you can also use wget): ROLE=$(curl -H Metadata-Flavor:Google http://metadata/computeMetadata/v1/instance/attributes/dataproc-role) if [[ "${ROLE}" == \'Master\' ]]; then ... master specific actions ... else ... worker specific actions ... fi ''', is_optional=True, ), 'configBucket': Field( String, description='''Optional. A Google Cloud Storage bucket used to stage job dependencies, config files, and job driver console output. If you do not specify a staging bucket, Cloud Dataproc will determine a Cloud Storage location (US, ASIA, or EU) for your cluster\'s staging bucket according to the Google Compute Engine zone where your cluster is deployed, and then create and manage this project-level, per-location bucket (see Cloud Dataproc staging bucket).''', is_optional=True, ), 'workerConfig': Field( Dict( fields={ 'accelerators': Field( List[ Dict( fields={ 'acceleratorCount': Field( Int, description='''The number of the accelerator cards of this type exposed to this instance.''', is_optional=True, ), 'acceleratorTypeUri': Field( String, description='''Full URL, partial URI, or short name of the accelerator type resource to expose to this instance. See Compute Engine AcceleratorTypes.Examples: https://www.googleapis.com/compute/beta/projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 projects/[project_id]/zones/us-east1-a/acceleratorTypes/nvidia-tesla-k80 nvidia-tesla-k80Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the accelerator type resource, for example, nvidia-tesla-k80.''', is_optional=True, ), } ) ], description='''Optional. The Compute Engine accelerator configuration for these instances.Beta Feature: This feature is still under development. It may be changed before final release.''', is_optional=True, ), 'numInstances': Field( Int, description='''Optional. The number of VM instances in the instance group. For master instance groups, must be set to 1.''', is_optional=True, ), 'diskConfig': Field( Dict( fields={ 'numLocalSsds': Field( Int, description='''Optional. Number of attached SSDs, from 0 to 4 (default is 0). If SSDs are not attached, the boot disk is used to store runtime logs and HDFS (https://hadoop.apache.org/docs/r1.2.1/hdfs_user_guide.html) data. 
If one or more SSDs are attached, this runtime bulk data is spread across them, and the boot disk contains only basic config and installed binaries.''', is_optional=True, ), 'bootDiskSizeGb': Field( Int, description='''Optional. Size in GB of the boot disk (default is 500GB).''', is_optional=True, ), 'bootDiskType': Field( String, description='''Optional. Type of the boot disk (default is "pd-standard"). Valid values: "pd-ssd" (Persistent Disk Solid State Drive) or "pd-standard" (Persistent Disk Hard Disk Drive).''', is_optional=True, ), } ), description='''Specifies the config of disk options for a group of VM instances.''', is_optional=True, ), 'managedGroupConfig': Field( Dict(fields={}), description='''Specifies the resources used to actively manage an instance group.''', is_optional=True, ), 'isPreemptible': Field( Bool, description='''Optional. Specifies that this instance group contains preemptible instances.''', is_optional=True, ), 'imageUri': Field( String, description='''Optional. The Compute Engine image resource used for cluster instances. It can be specified or may be inferred from SoftwareConfig.image_version.''', is_optional=True, ), 'machineTypeUri': Field( String, description='''Optional. The Compute Engine machine type used for cluster instances.A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 projects/[project_id]/zones/us-east1-a/machineTypes/n1-standard-2 n1-standard-2Auto Zone Exception: If you are using the Cloud Dataproc Auto Zone Placement feature, you must use the short name of the machine type resource, for example, n1-standard-2.''', is_optional=True, ), } ), description='''Optional. The config settings for Compute Engine resources in an instance group, such as a master or worker group.''', is_optional=True, ), 'gceClusterConfig': Field( Dict( fields={ 'networkUri': Field( String, description='''Optional. The Compute Engine network to be used for machine communications. Cannot be specified with subnetwork_uri. If neither network_uri nor subnetwork_uri is specified, the "default" network of the project is used, if it exists. Cannot be a "Custom Subnet Network" (see Using Subnetworks for more information).A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/regions/global/default projects/[project_id]/regions/global/default default''', is_optional=True, ), 'zoneUri': Field( String, description='''Optional. The zone where the Compute Engine cluster will be located. On a create request, it is required in the "global" region. If omitted in a non-global Cloud Dataproc region, the service will pick a zone in the corresponding Compute Engine region. On a get request, zone will always be present.A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/zones/[zone] projects/[project_id]/zones/[zone] us-central1-f''', is_optional=True, ), 'metadata': Field( PermissiveDict(), description='''The Compute Engine metadata entries to add to all instances (see Project and instance metadata (https://cloud.google.com/compute/docs/storing-retrieving-metadata#project_and_instance_metadata)).''', is_optional=True, ), 'internalIpOnly': Field( Bool, description='''Optional. If true, all instances in the cluster will only have internal IP addresses. 
By default, clusters are not restricted to internal IP addresses, and will have ephemeral external IP addresses assigned to each instance. This internal_ip_only restriction can only be enabled for subnetwork enabled networks, and all off-cluster dependencies must be configured to be accessible without external IP addresses.''', is_optional=True, ), 'serviceAccountScopes': Field( List[String], description='''Optional. The URIs of service account scopes to be included in Compute Engine instances. The following base set of scopes is always included: https://www.googleapis.com/auth/cloud.useraccounts.readonly https://www.googleapis.com/auth/devstorage.read_write https://www.googleapis.com/auth/logging.writeIf no scopes are specified, the following defaults are also provided: https://www.googleapis.com/auth/bigquery https://www.googleapis.com/auth/bigtable.admin.table https://www.googleapis.com/auth/bigtable.data https://www.googleapis.com/auth/devstorage.full_control''', is_optional=True, ), 'tags': Field( List[String], description='''The Compute Engine tags to add to all instances (see Tagging instances).''', is_optional=True, ), 'serviceAccount': Field( String, description='''Optional. The service account of the instances. Defaults to the default Compute Engine service account. Custom service accounts need permissions equivalent to the following IAM roles: roles/logging.logWriter roles/storage.objectAdmin(see https://cloud.google.com/compute/docs/access/service-accounts#custom_service_accounts for more information). Example: [account_id]@[project_id].iam.gserviceaccount.com''', is_optional=True, ), 'subnetworkUri': Field( String, description='''Optional. The Compute Engine subnetwork to be used for machine communications. Cannot be specified with network_uri.A full URL, partial URI, or short name are valid. Examples: https://www.googleapis.com/compute/v1/projects/[project_id]/regions/us-east1/subnetworks/sub0 projects/[project_id]/regions/us-east1/subnetworks/sub0 sub0''', is_optional=True, ), } ), description='''Common config settings for resources of Compute Engine cluster instances, applicable to all instances in the cluster.''', is_optional=True, ), 'softwareConfig': Field( Dict( fields={ 'properties': Field( PermissiveDict(), description='''Optional. The properties to set on daemon config files.Property keys are specified in prefix:property format, for example core:hadoop.tmp.dir. The following are supported prefixes and their mappings: capacity-scheduler: capacity-scheduler.xml core: core-site.xml distcp: distcp-default.xml hdfs: hdfs-site.xml hive: hive-site.xml mapred: mapred-site.xml pig: pig.properties spark: spark-defaults.conf yarn: yarn-site.xmlFor more information, see Cluster properties.''', is_optional=True, ), 'optionalComponents': Field( List[Component], description='''The set of optional components to activate on the cluster.''', is_optional=True, ), 'imageVersion': Field( String, description='''Optional. The version of software inside the cluster. It must be one of the supported Cloud Dataproc Versions, such as "1.2" (including a subminor version, such as "1.2.29"), or the "preview" version. If unspecified, it defaults to the latest Debian version.''', is_optional=True, ), } ), description='''Specifies the selection and config of software inside the cluster.''', is_optional=True, ), } ), description='''The cluster config.''', is_optional=True, )
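
# Illustrative only: a value of the rough shape define_dataproc_cluster_config() accepts, touching
# the masterConfig, workerConfig, gceClusterConfig, and softwareConfig sections of the schema
# above. All concrete values are placeholders; 'metadata' and 'properties' are PermissiveDicts.
EXAMPLE_DATAPROC_CLUSTER_CONFIG = {
    'masterConfig': {'numInstances': 1, 'machineTypeUri': 'n1-standard-2'},
    'workerConfig': {
        'numInstances': 2,
        'diskConfig': {'bootDiskSizeGb': 100, 'bootDiskType': 'pd-standard'},
    },
    'gceClusterConfig': {'zoneUri': 'us-central1-f', 'metadata': {'owner': 'data-team'}},
    'softwareConfig': {
        'imageVersion': '1.2',
        'properties': {'spark:spark.executor.memory': '2g'},
    },
}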
def define_emr_run_job_flow_config(): name = Field(String, description='The name of the job flow.', is_optional=False) log_uri = Field( String, description='''The location in Amazon S3 to write the log files of the job flow. If a value is not provided, logs are not created.''', is_optional=True, ) additional_info = Field( String, description='A JSON string for selecting additional features.', is_optional=True ) ami_version = Field( String, description='''Applies only to Amazon EMR AMI versions 3.x and 2.x. For Amazon EMR releases 4.0 and later, ReleaseLabel is used. To specify a custom AMI, use CustomAmiID.''', is_optional=True, ) release_label = Field( String, description='''The Amazon EMR release label, which determines the version of open-source application packages installed on the cluster. Release labels are in the form emr-x.x.x, where x.x.x is an Amazon EMR release version, for example, emr-5.14.0 . For more information about Amazon EMR release versions and included application versions and features, see https://docs.aws.amazon.com/emr/latest/ReleaseGuide/. The release label applies only to Amazon EMR releases versions 4.x and later. Earlier versions use AmiVersion.''', is_optional=True, ) instances = Field( Dict( fields={ 'MasterInstanceType': Field( String, description='The EC2 instance type of the master node.', is_optional=True, ), 'SlaveInstanceType': Field( String, description='The EC2 instance type of the core and task nodes.', is_optional=True, ), 'InstanceCount': Field( Int, description='The number of EC2 instances in the cluster.', is_optional=True ), 'InstanceGroups': _define_instance_groups(), 'InstanceFleets': _define_instance_fleets(), 'Ec2KeyName': Field( String, description='''The name of the EC2 key pair that can be used to ssh to the master node as the user called "hadoop."''', is_optional=True, ), 'Placement': Field( Dict( fields={ 'AvailabilityZone': Field( String, description='''The Amazon EC2 Availability Zone for the cluster. AvailabilityZone is used for uniform instance groups, while AvailabilityZones (plural) is used for instance fleets.''', is_optional=True, ), 'AvailabilityZones': Field( List[String], description='''When multiple Availability Zones are specified, Amazon EMR evaluates them and launches instances in the optimal Availability Zone. AvailabilityZones is used for instance fleets, while AvailabilityZone (singular) is used for uniform instance groups.''', is_optional=True, ), } ), description='The Availability Zone in which the cluster runs.', is_optional=True, ), 'KeepJobFlowAliveWhenNoSteps': Field( Bool, description='''Specifies whether the cluster should remain available after completing all steps.''', is_optional=True, ), 'TerminationProtected': Field( Bool, description='''Specifies whether to lock the cluster to prevent the Amazon EC2 instances from being terminated by API call, user intervention, or in the event of a job-flow error.''', is_optional=True, ), 'HadoopVersion': Field( String, description='''Applies only to Amazon EMR release versions earlier than 4.0. The Hadoop version for the cluster. Valid inputs are "0.18" (deprecated), "0.20" (deprecated), "0.20.205" (deprecated), "1.0.3", "2.2.0", or "2.4.0". 
If you do not set this value, the default of 0.18 is used, unless the AmiVersion parameter is set in the RunJobFlow call, in which case the default version of Hadoop for that AMI version is used.''', is_optional=True, ), 'Ec2SubnetId': Field( String, description='''Applies to clusters that use the uniform instance group configuration. To launch the cluster in Amazon Virtual Private Cloud (Amazon VPC), set this parameter to the identifier of the Amazon VPC subnet where you want the cluster to launch. If you do not specify this value, the cluster launches in the normal Amazon Web Services cloud, outside of an Amazon VPC, if the account launching the cluster supports EC2 Classic networks in the region where the cluster launches. Amazon VPC currently does not support cluster compute quadruple extra large (cc1.4xlarge) instances. Thus you cannot specify the cc1.4xlarge instance type for clusters launched in an Amazon VPC.''', is_optional=True, ), 'Ec2SubnetIds': Field( List[String], description='''Applies to clusters that use the instance fleet configuration. When multiple EC2 subnet IDs are specified, Amazon EMR evaluates them and launches instances in the optimal subnet.''', is_optional=True, ), 'EmrManagedMasterSecurityGroup': Field( String, description='''The identifier of the Amazon EC2 security group for the master node.''', is_optional=True, ), 'EmrManagedSlaveSecurityGroup': Field( String, description='''The identifier of the Amazon EC2 security group for the core and task nodes.''', is_optional=True, ), 'ServiceAccessSecurityGroup': Field( String, description='''The identifier of the Amazon EC2 security group for the Amazon EMR service to access clusters in VPC private subnets.''', is_optional=True, ), 'AdditionalMasterSecurityGroups': Field( List[String], description='''A list of additional Amazon EC2 security group IDs for the master node.''', is_optional=True, ), 'AdditionalSlaveSecurityGroups': Field( List[String], description='''A list of additional Amazon EC2 security group IDs for the core and task nodes.''', is_optional=True, ), } ), description='A specification of the number and type of Amazon EC2 instances.', is_optional=False, ) supported_products = Field( List[EmrSupportedProducts], description='''A list of strings that indicates third-party software to use. For more information, see the Amazon EMR Developer Guide. Currently supported values are: - "mapr-m3" - launch the job flow using MapR M3 Edition. - "mapr-m5" - launch the job flow using MapR M5 Edition. ''', is_optional=True, ) new_supported_products = Field( List[ Dict( fields={ 'Name': Field(String, is_optional=False), 'Args': Field(List[String], description='The list of user-supplied arguments.'), } ) ], description=''' The list of supported product configurations which allow user-supplied arguments. EMR accepts these arguments and forwards them to the corresponding installation script as bootstrap action arguments. A list of strings that indicates third-party software to use with the job flow that accepts a user argument list. EMR accepts and forwards the argument list to the corresponding installation script as bootstrap action arguments. For more information, see "Launch a Job Flow on the MapR Distribution for Hadoop" in the Amazon EMR Developer Guide. Supported values are: - "mapr-m3" - launch the cluster using MapR M3 Edition. - "mapr-m5" - launch the cluster using MapR M5 Edition. 
- "mapr" with the user arguments specifying "--edition,m3" or "--edition,m5" - launch the job flow using MapR M3 or M5 Edition respectively. - "mapr-m7" - launch the cluster using MapR M7 Edition. - "hunk" - launch the cluster with the Hunk Big Data Analtics Platform. - "hue"- launch the cluster with Hue installed. - "spark" - launch the cluster with Apache Spark installed. - "ganglia" - launch the cluster with the Ganglia Monitoring System installed.''', is_optional=True, ) applications = Field( List[ Dict( fields={ 'Name': Field( String, description='The name of the application.', is_optional=False ), 'Version': Field( String, description='The version of the application.', is_optional=True ), 'Args': Field( List[String], description='Arguments for Amazon EMR to pass to the application.', is_optional=True, ), 'AdditionalInfo': Field( PermissiveDict(), description='''This option is for advanced users only. This is meta information about third-party applications that third-party vendors use for testing purposes.''', is_optional=True, ), } ) ], description='''Applies to Amazon EMR releases 4.0 and later. A case-insensitive list of applications for Amazon EMR to install and configure when launching the cluster. For a list of applications available for each Amazon EMR release version, see the Amazon EMR Release Guide. With Amazon EMR release version 4.0 and later, the only accepted parameter is the application name. To pass arguments to applications, you use configuration classifications specified using configuration JSON objects. For more information, see the EMR Configuring Applications guide. With earlier Amazon EMR releases, the application is any Amazon or third-party software that you can add to the cluster. This structure contains a list of strings that indicates the software to use with the cluster and accepts a user argument list. Amazon EMR accepts and forwards the argument list to the corresponding installation script as bootstrap action argument.''', is_optional=True, ) visible_to_all_users = Field( Bool, description='''Whether the cluster is visible to all IAM users of the AWS account associated with the cluster. If this value is set to True, all IAM users of that AWS account can view and (if they have the proper policy permissions set) manage the cluster. If it is set to False, only the IAM user that created the cluster can view and manage it.''', is_optional=True, default_value=True, ) job_flow_role = Field( String, description='''Also called instance profile and EC2 role. An IAM role for an EMR cluster. The EC2 instances of the cluster assume this role. The default role is EMR_EC2_DefaultRole. In order to use the default role, you must have already created it using the CLI or console. ''', is_optional=True, ) service_role = Field( String, description='''The IAM role that will be assumed by the Amazon EMR service to access AWS resources on your behalf.''', is_optional=True, ) tags = Field( List[ Dict( fields={ 'Key': Field( String, description='''A user-defined key, which is the minimum required information for a valid tag. For more information, see the EMR Tag guide.''', is_optional=False, ), 'Value': Field( String, description='''A user-defined value, which is optional in a tag. For more information, see the EMR Tag Clusters guide.''', is_optional=True, ), } ) ], description='''A list of tags to associate with a cluster and propagate to Amazon EC2 instances. A key/value pair containing user-defined metadata that you can associate with an Amazon EMR resource. 
Tags make it easier to associate clusters in various ways, such as grouping clusters to track your Amazon EMR resource allocation costs. For more information, see the EMR Tag Clusters guide.''', is_optional=True, ) security_configuration = Field( String, description='The name of a security configuration to apply to the cluster.', is_optional=True, ) auto_scaling_role = Field( String, description='''An IAM role for automatic scaling policies. The default role is EMR_AutoScaling_DefaultRole. The IAM role provides permissions that the automatic scaling feature requires to launch and terminate EC2 instances in an instance group.''', is_optional=True, ) scale_down_behavior = Field( EmrScaleDownBehavior, description='''Specifies the way that individual Amazon EC2 instances terminate when an automatic scale-in activity occurs or an instance group is resized. TERMINATE_AT_INSTANCE_HOUR indicates that Amazon EMR terminates nodes at the instance-hour boundary, regardless of when the request to terminate the instance was submitted. This option is only available with Amazon EMR 5.1.0 and later and is the default for clusters created using that version. TERMINATE_AT_TASK_COMPLETION indicates that Amazon EMR blacklists and drains tasks from nodes before terminating the Amazon EC2 instances, regardless of the instance-hour boundary. With either behavior, Amazon EMR removes the least active nodes first and blocks instance termination if it could lead to HDFS corruption. TERMINATE_AT_TASK_COMPLETION available only in Amazon EMR version 4.1.0 and later, and is the default for versions of Amazon EMR earlier than 5.1.0.''', is_optional=True, ) custom_ami_id = Field( String, description='''Available only in Amazon EMR version 5.7.0 and later. The ID of a custom Amazon EBS-backed Linux AMI. If specified, Amazon EMR uses this AMI when it launches cluster EC2 instances. For more information about custom AMIs in Amazon EMR, see Using a Custom AMI in the Amazon EMR Management Guide. If omitted, the cluster uses the base Linux AMI for the ReleaseLabel specified. For Amazon EMR versions 2.x and 3.x, use AmiVersion instead. For information about creating a custom AMI, see Creating an Amazon EBS-Backed Linux AMI in the Amazon Elastic Compute Cloud User Guide for Linux Instances. For information about finding an AMI ID, see Finding a Linux AMI.''', is_optional=True, ) repo_upgrade_on_boot = Field( EmrRepoUpgradeOnBoot, description='''Applies only when CustomAmiID is used. Specifies which updates from the Amazon Linux AMI package repositories to apply automatically when the instance boots using the AMI. If omitted, the default is SECURITY , which indicates that only security updates are applied. If NONE is specified, no updates are applied, and all updates must be applied manually.''', is_optional=True, ) kerberos_attributes = Field( Dict( fields={ 'Realm': Field( String, description='''The name of the Kerberos realm to which all nodes in a cluster belong. For example, EC2.INTERNAL.''', is_optional=False, ), 'KdcAdminPassword': Field( String, description='''The password used within the cluster for the kadmin service on the cluster-dedicated KDC, which maintains Kerberos principals, password policies, and keytabs for the cluster.''', is_optional=False, ), 'CrossRealmTrustPrincipalPassword': Field( String, description='''Required only when establishing a cross-realm trust with a KDC in a different realm. 
The cross-realm principal password, which must be identical across realms.''', is_optional=True, ), 'ADDomainJoinUser': Field( String, description='''Required only when establishing a cross-realm trust with an Active Directory domain. A user with sufficient privileges to join resources to the domain.''', is_optional=True, ), 'ADDomainJoinPassword': Field( String, description='''The Active Directory password for ADDomainJoinUser.''', is_optional=True, ), } ), description='''Attributes for Kerberos configuration when Kerberos authentication is enabled using a security configuration. For more information, see Use Kerberos Authentication in the EMR Management Guide.''', is_optional=True, ) return Field( Dict( fields={ 'Name': name, 'LogUri': log_uri, 'AdditionalInfo': additional_info, 'AmiVersion': ami_version, 'ReleaseLabel': release_label, 'Instances': instances, 'Steps': _define_steps(), 'BootstrapActions': _define_bootstrap_actions(), 'SupportedProducts': supported_products, 'NewSupportedProducts': new_supported_products, 'Applications': applications, 'Configurations': _define_configurations(), 'VisibleToAllUsers': visible_to_all_users, 'JobFlowRole': job_flow_role, 'ServiceRole': service_role, 'Tags': tags, 'SecurityConfiguration': security_configuration, 'AutoScalingRole': auto_scaling_role, 'ScaleDownBehavior': scale_down_behavior, 'CustomAmiId': custom_ami_id, 'EbsRootVolumeSize': Field( Int, description='''The size, in GiB, of the EBS root device volume of the Linux AMI that is used for each EC2 instance. Available in Amazon EMR version 4.x and later.''', is_optional=True, ), 'RepoUpgradeOnBoot': repo_upgrade_on_boot, 'KerberosAttributes': kerberos_attributes, } ), description='AWS EMR run job flow configuration', )
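# A minimal sketch of a value that fits the run job flow schema defined above. The
# variable name and the specific values are illustrative assumptions, and this is not a
# complete or validated EMR request; the keys correspond to the Dict fields returned by
# this function.
example_run_job_flow_config = {
    'Name': 'example-cluster',
    'ReleaseLabel': 'emr-5.23.0',
    'Applications': [{'Name': 'Spark'}],
    'Configurations': [
        {
            'Classification': 'spark-defaults',
            'Properties': {'spark.dynamicAllocation.enabled': 'true'},
        }
    ],
    'VisibleToAllUsers': True,
    'Tags': [{'Key': 'team', 'Value': 'data'}],
    'ScaleDownBehavior': 'TERMINATE_AT_TASK_COMPLETION',
    'EbsRootVolumeSize': 32,
}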
def test_construct_different_perm_dicts(): int_perm_dict = PermissiveDict(fields={'an_int': Field(int)}) string_perm_dict = PermissiveDict(fields={'a_string': Field(str)}) assert int_perm_dict is not string_perm_dict assert int_perm_dict.inst().key != string_perm_dict.inst().key
def test_construct_permissive_dict_same_same(): assert PermissiveDict() is PermissiveDict()
def test_kitchen_sink(): big_dict_1 = Dict({ 'field_one': Field(int, default_value=2, is_optional=True), 'field_two': Field( Dict({ 'nested_field_one': Field(bool), 'nested_selector': Field( Selector({ 'int_field_in_selector': Field(int), 'permissive_dict_in_selector': Field(PermissiveDict()), 'permissive_dict_with_fields_in_selector': Field(PermissiveDict({'string_field': Field(str)})), })), })), }) big_dict_2 = Dict({ 'field_one': Field(int, default_value=2, is_optional=True), 'field_two': Field( Dict( fields={ 'nested_field_one': Field(bool), 'nested_selector': Field( Selector( fields={ 'permissive_dict_in_selector': Field(PermissiveDict()), 'int_field_in_selector': Field(int), 'permissive_dict_with_fields_in_selector': Field( PermissiveDict( fields={'string_field': Field(str)})), })), })), }) assert big_dict_1 is big_dict_2 assert big_dict_1.inst().key == big_dict_2.inst().key # differs way down in tree big_dict_3 = Dict({ 'field_one': Field(int, default_value=2, is_optional=True), 'field_two': Field( Dict( fields={ 'nested_field_one': Field(bool), 'nested_selector': Field( Selector( fields={ 'permissive_dict_in_selector': Field(PermissiveDict()), 'int_field_in_selector': Field(int), 'permissive_dict_with_fields_in_selector': Field( PermissiveDict( fields={'int_field': Field(int)})), })), })), }) assert big_dict_1 is not big_dict_3 assert big_dict_1.inst().key != big_dict_3.inst().key
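# A small sketch of the memoization behavior exercised by the tests above, assuming the
# same Dict/Field/PermissiveDict imports used in those tests: structurally identical
# schemas resolve to the same class object and the same config type key, regardless of
# the order in which the fields are declared.
def _structural_memoization_sketch():
    schema_a = Dict({'name': Field(str), 'meta': Field(PermissiveDict())})
    schema_b = Dict(fields={'meta': Field(PermissiveDict()), 'name': Field(str)})
    assert schema_a is schema_b
    assert schema_a.inst().key == schema_b.inst().key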
def bash_command_solid(bash_command, name=None, output_encoding=None): '''Execute a Bash command. ''' check.str_param(bash_command, 'bash_command') name = check.opt_str_param(name, 'name', default='bash_solid') output_encoding = check.opt_str_param(output_encoding, 'output_encoding', default='utf-8') @solid( name=name, config={ 'output_logging': Field( Enum( 'OutputType', [ EnumValue('STREAM', description='Stream script stdout/stderr.'), EnumValue( 'BUFFER', description= 'Buffer bash script stdout/stderr, then log upon completion.', ), EnumValue('NONE', description='No logging'), ], ), is_optional=True, default_value='STREAM', ), 'env': Field( PermissiveDict(), description= 'Environment variables to pass to the child process; if not provided, ' 'the current process environment will be passed.', is_optional=True, default_value=None, ), }, ) def _bash_solid(context): '''This logic is ported from the Airflow BashOperator implementation. https://github.com/apache/airflow/blob/master/airflow/operators/bash_operator.py ''' def log_info_msg(log_str): context.log.info('[bash][{name}] '.format(name=name) + log_str) tmp_path = seven.get_system_temp_directory() log_info_msg('using temporary directory: %s' % tmp_path) env = (context.solid_config['env'] if context.solid_config['env'] is not None else os.environ.copy()) with NamedTemporaryFile(dir=tmp_path, prefix=name) as tmp_file: tmp_file.write(bytes(bash_command.encode('utf-8'))) tmp_file.flush() script_location = os.path.abspath(tmp_file.name) log_info_msg('Temporary script location: {location}'.format( location=script_location)) def pre_exec(): # Restore default signal disposition and invoke setsid for sig in ('SIGPIPE', 'SIGXFZ', 'SIGXFSZ'): if hasattr(signal, sig): signal.signal(getattr(signal, sig), signal.SIG_DFL) os.setsid() log_info_msg( 'Running command: {command}'.format(command=bash_command)) # pylint: disable=subprocess-popen-preexec-fn sub_process = Popen( ['bash', tmp_file.name], stdout=PIPE, stderr=STDOUT, cwd=tmp_path, env=env, preexec_fn=pre_exec, ) # Stream back logs as they are emitted if context.solid_config['output_logging'] == 'STREAM': line = '' for raw_line in iter(sub_process.stdout.readline, b''): line = raw_line.decode(output_encoding).rstrip() log_info_msg(line) sub_process.wait() # Collect and buffer all logs, then emit if context.solid_config['output_logging'] == 'BUFFER': line = '' for raw_line in iter(sub_process.stdout.readline, b''): line += raw_line.decode(output_encoding) log_info_msg(line) # no logging in this case elif context.solid_config['output_logging'] == 'NONE': pass log_info_msg('Command exited with return code {retcode}'.format( retcode=sub_process.returncode)) if sub_process.returncode: raise Failure( description='[bash][{name}] Bash command failed'.format( name=name)) return line return _bash_solid
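# A hedged usage sketch for the factory above, assuming dagster's @pipeline and
# execute_pipeline APIs; the pipeline name, solid name, and environment dict below are
# illustrative assumptions, while 'output_logging' and 'env' come from the config schema
# defined in bash_command_solid.
echo_solid = bash_command_solid('echo "hello from bash"', name='echo_solid')


@pipeline
def sketch_bash_pipeline():
    echo_solid()


# Buffer the script output instead of streaming it, and pass an explicit environment.
execute_pipeline(
    sketch_bash_pipeline,
    {
        'solids': {
            'echo_solid': {
                'config': {'output_logging': 'BUFFER', 'env': {'SOME_VAR': 'some-value'}}
            }
        }
    },
)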
def put_object_configs(): return Field( Dict( fields={ 'ACL': Field(S3ACL, description='The canned ACL to apply to the object.', is_optional=True), # Body will be set by the solid, not supplied in config 'Bucket': Field( String, description= 'Name of the bucket to which the PUT operation was initiated.', is_optional=False, ), 'CacheControl': Field( String, description= 'Specifies caching behavior along the request/reply chain.', is_optional=True, ), 'ContentDisposition': Field( String, description= 'Specifies presentational information for the object.', is_optional=True, ), 'ContentEncoding': Field( String, description= '''Specifies what content encodings have been applied to the object and thus what decoding mechanisms must be applied to obtain the media-type referenced by the Content-Type header field.''', is_optional=True, ), 'ContentLanguage': Field(String, description='The language the content is in.', is_optional=True), 'ContentLength': Field( Int, description= '''Size of the body in bytes. This parameter is useful when the size of the body cannot be determined automatically.''', is_optional=True, ), 'ContentMD5': Field( String, description= '''The base64-encoded 128-bit MD5 digest of the part data. This parameter is auto-populated when using the command from the CLI.''', is_optional=True, ), 'ContentType': Field( String, description= 'A standard MIME type describing the format of the object data.', is_optional=True, ), # TODO: datetime object # # 'Expires': Field(datetime, description='The date and time at which the object is # no longer cacheable.', is_optional=True), 'GrantFullControl': Field( String, description= '''Gives the grantee READ, READ_ACP, and WRITE_ACP permissions on the object.''', is_optional=True, ), 'GrantRead': Field( String, description= 'Allows grantee to read the object data and its metadata.', is_optional=True, ), 'GrantReadACP': Field(String, description='Allows grantee to read the object ACL.', is_optional=True), 'GrantWriteACP': Field( String, description= 'Allows grantee to write the ACL for the applicable object.', is_optional=True, ), 'Key': Field( String, description= 'Object key for which the PUT operation was initiated.', is_optional=False, ), 'Metadata': Field( PermissiveDict(), description= 'A map of metadata to store with the object in S3.', is_optional=True, ), 'ServerSideEncryption': Field( String, description= '''The Server-side encryption algorithm used when storing this object in S3 (e.g., AES256, aws:kms).''', is_optional=True, ), 'StorageClass': Field( String, description= '''The type of storage to use for the object. Defaults to 'STANDARD'.''', is_optional=True, ), 'WebsiteRedirectLocation': Field( String, description= '''If the bucket is configured as a website, redirects requests for this object to another object in the same bucket or to an external URL. Amazon S3 stores the value of this header in the object metadata.''', is_optional=True, ), 'SSECustomerAlgorithm': Field( String, description= '''Specifies the algorithm to use when encrypting the object (e.g., AES256).''', is_optional=True, ), 'SSECustomerKey': Field( String, description= '''Specifies the customer-provided encryption key for Amazon S3 to use in encrypting data. This value is used to store the object and then it is discarded; Amazon does not store the encryption key.
The key must be appropriate for use with the algorithm specified in the x-amz-server-side-encryption-customer-algorithm header.''', is_optional=True, ), 'SSECustomerKeyMD5': Field( String, description= '''Specifies the 128-bit MD5 digest of the encryption key according to RFC 1321. Amazon S3 uses this header for a message integrity check to ensure the encryption key was transmitted without error. Please note that this parameter is automatically populated if it is not provided. Including this parameter is not required.''', is_optional=True, ), 'SSEKMSKeyId': Field( String, description= '''Specifies the AWS KMS key ID to use for object encryption. All GET and PUT requests for an object protected by AWS KMS will fail if not made via SSL or using SigV4. Documentation on configuring any of the officially supported AWS SDKs and CLI can be found at http://docs.aws.amazon.com/AmazonS3/latest/dev/UsingAWSSDK.html#specify-signature-version''', is_optional=True, ), 'RequestPayer': Field( String, description= '''Confirms that the requester knows that she or he will be charged for the request. Bucket owners need not specify this parameter in their requests. Documentation on downloading objects from requester pays buckets can be found at http://docs.aws.amazon.com/AmazonS3/latest/dev/ObjectsinRequesterPaysBuckets.html''', is_optional=True, ), 'Tagging': Field( String, description= '''The tag-set for the object. The tag-set must be encoded as URL Query parameters. (For example, "Key1=Value1")''', is_optional=True, ), 'ObjectLockMode': Field( String, description= 'The Object Lock mode that you want to apply to this object.', is_optional=True, ), # TODO: datetime object # 'ObjectLockRetainUntilDate': Field(datetime, # description='The date and time when you want this object\'s Object Lock to # expire.', is_optional=True), 'ObjectLockLegalHoldStatus': Field( String, description= '''The Legal Hold status that you want to apply to the specified object.''', is_optional=True, ), }))
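# A hedged sketch of how a solid might forward the config above to S3. boto3's
# client('s3').put_object accepts these parameter names, but the helper function and the
# bucket/key values here are illustrative assumptions; Body is supplied by the caller
# rather than by config, per the comment in the schema above.
import boto3


def _put_object_sketch(put_object_config, body):
    client = boto3.client('s3')
    return client.put_object(Body=body, **put_object_config)


_put_object_sketch(
    {'Bucket': 'some-bucket', 'Key': 'path/to/object.txt', 'ACL': 'private'},
    b'hello, world',
)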
'\'localhost\')}//\'.'), ), 'backend': Field( String, is_optional=True, default_value='rpc://', description= 'The URL of the Celery results backend. Default: \'rpc://\'.', ), 'include': Field(List[String], is_optional=True, description='List of modules every worker should import.'), 'config_source': Field(PermissiveDict(), is_optional=True, description='Settings for the Celery app.'), }, ) def celery_executor(init_context): '''Celery-based executor. The Celery executor exposes config settings for the underlying Celery app under the ``config_source`` key. This config corresponds to the "new lowercase settings" introduced in Celery version 4.0, and the object constructed from config will be passed to the :py:class:`celery.Celery` constructor as its ``config_source`` argument. (See https://docs.celeryproject.org/en/latest/userguide/configuration.html for details.) The executor also exposes the ``broker``, ``backend``, and ``include`` arguments to the :py:class:`celery.Celery` constructor.