Example #1
def main():
    arguments = parse_cmd()
    sdk = yandexcloud.SDK(token=arguments.token)

    fill_missing_flags(sdk, arguments)

    resources = common_pb.Resources(
        resource_preset_id='s2.micro',
        disk_size=15 * (1024**3),
        disk_type_id='network-ssd',
    )
    req = create_cluster_request(arguments, resources=resources)
    cluster_id = None
    try:
        cluster = create_cluster(sdk, req)
        cluster_id = cluster.id
        change_cluster_description(sdk, cluster_id)
        add_subcluster(sdk, cluster_id, arguments, resources=resources)

        run_hive_job(sdk, cluster_id=cluster_id)
        run_mapreduce_job(sdk,
                          cluster_id=cluster_id,
                          bucket=arguments.s3_bucket)
        run_spark_job(sdk, cluster_id=cluster_id, bucket=arguments.s3_bucket)
        run_pyspark_job(sdk, cluster_id=cluster_id, bucket=arguments.s3_bucket)
    finally:
        if cluster_id is not None:
            delete_cluster(sdk, cluster_id)
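
The helper functions used above (parse_cmd, fill_missing_flags, create_cluster_request, the run_*_job functions and delete_cluster) come from the rest of the example script and are not shown here. As a rough sketch only, parse_cmd might look like the following, assuming argparse and deriving the flag names from the attributes accessed above (token, s3_bucket); the exact flags in the original script may differ:

import argparse

def parse_cmd():
    # Hypothetical reconstruction of the argument parser used by this example.
    parser = argparse.ArgumentParser(description='Run Yandex.Cloud Data Proc example jobs.')
    parser.add_argument('--token', help='OAuth token for yandexcloud.SDK')
    parser.add_argument('--s3-bucket', dest='s3_bucket',
                        help='Object Storage bucket used for cluster logs and job artifacts')
    return parser.parse_args()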
Example #2
def main():
    logging.basicConfig(level=logging.INFO)
    arguments = parse_cmd()
    if arguments.token:
        sdk = yandexcloud.SDK(token=arguments.token, user_agent=USER_AGENT)
    else:
        with open(arguments.sa_json_path) as infile:
            sdk = yandexcloud.SDK(service_account_key=json.load(infile),
                                  user_agent=USER_AGENT)

    fill_missing_arguments(sdk, arguments)

    resources = common_pb.Resources(
        resource_preset_id='s2.micro',
        disk_type_id='network-ssd',
    )
    cluster_id = None
    try:
        operation_result = create_cluster(
            sdk, create_cluster_request(arguments, resources=resources))
        cluster_id = operation_result.response.id
        change_cluster_description(sdk, cluster_id)
        add_subcluster(sdk, cluster_id, arguments, resources=resources)

        run_hive_job(sdk, cluster_id=cluster_id)
        run_mapreduce_job(sdk,
                          cluster_id=cluster_id,
                          bucket=arguments.s3_bucket)
        run_spark_job(sdk, cluster_id=cluster_id, bucket=arguments.s3_bucket)
        run_pyspark_job(sdk, cluster_id=cluster_id, bucket=arguments.s3_bucket)
    finally:
        if cluster_id is not None:
            delete_cluster(sdk, cluster_id)
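
fill_missing_arguments is likewise not shown. A plausible sketch, assuming it resolves the same defaults as the wrapper methods in the later examples do via sdk.helpers (find_network_id, find_subnet_id, find_service_account_id), and assuming the parsed arguments carry folder_id and zone attributes (not confirmed by this snippet):

def fill_missing_arguments(sdk, arguments):
    # Hypothetical helper: fill in defaults that were not passed on the command line.
    if not getattr(arguments, 'subnet_id', None):
        network_id = sdk.helpers.find_network_id(arguments.folder_id)
        arguments.subnet_id = sdk.helpers.find_subnet_id(
            arguments.folder_id, arguments.zone, network_id)
    if not getattr(arguments, 'service_account_id', None):
        arguments.service_account_id = sdk.helpers.find_service_account_id(
            arguments.folder_id)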
Example #3
    def create_cluster(
        self,
        s3_bucket,
        folder_id=None,
        cluster_name=None,
        cluster_description='',
        cluster_image_version='1.1',
        ssh_public_keys=None,
        subnet_id=None,
        services=('HDFS', 'YARN', 'MAPREDUCE', 'HIVE', 'SPARK'),
        zone='ru-central1-b',
        service_account_id=None,
        masternode_resource_preset='s2.small',
        masternode_disk_size=15,
        masternode_disk_type='network-ssd',
        datanode_resource_preset='s2.small',
        datanode_disk_size=15,
        datanode_disk_type='network-ssd',
        datanode_count=2,
        computenode_resource_preset='s2.small',
        computenode_disk_size=15,
        computenode_disk_type='network-ssd',
        computenode_count=0,
    ):
        """
        Create Yandex.Cloud Data Proc cluster.

        :param s3_bucket: Yandex.Cloud S3 bucket to store cluster logs.
                          Jobs will not work if the bucket is not specified.
        :type s3_bucket: str
        :param folder_id: ID of the folder in which cluster should be created.
        :type folder_id: str
        :param cluster_name: Cluster name. Must be unique inside the folder.
        :type cluster_name: str
        :param cluster_description: Cluster description.
        :type cluster_description: str
        :param cluster_image_version: Cluster image version. Using the default is recommended.
        :type cluster_image_version: str
        :param ssh_public_keys: List of SSH public keys that will be deployed to created compute instances.
        :type ssh_public_keys: List[str]
        :param subnet_id: ID of the subnetwork. All Data Proc cluster nodes will use one subnetwork.
        :type subnet_id: str
        :param services: List of services that will be installed to the cluster. Possible options:
            HDFS, YARN, MAPREDUCE, HIVE, TEZ, ZOOKEEPER, HBASE, SQOOP, FLUME, SPARK, ZEPPELIN, OOZIE
        :type services: List[str]
        :param zone: Availability zone to create cluster in.
                     Currently there are ru-central1-a, ru-central1-b and ru-central1-c.
        :type zone: str
        :param service_account_id: Service account id for the cluster.
                                   Service account can be created inside the folder.
        :type service_account_id: str
        :param masternode_resource_preset: Resources preset (CPU+RAM configuration)
                                           for the master node of the cluster.
        :type masternode_resource_preset: str
        :param masternode_disk_size: Masternode storage size in GiB.
        :type masternode_disk_size: int
        :param masternode_disk_type: Masternode storage type. Possible options: network-ssd, network-hdd.
        :type masternode_disk_type: str
        :param datanode_resource_preset: Resources preset (CPU+RAM configuration)
                                         for the data nodes of the cluster.
        :type datanode_resource_preset: str
        :param datanode_disk_size: Datanodes storage size in GiB.
        :type datanode_disk_size: int
        :param datanode_disk_type: Datanodes storage type. Possible options: network-ssd, network-hdd.
        :type datanode_disk_type: str
        :param datanode_count: Number of data nodes.
        :type datanode_count: int
        :param computenode_resource_preset: Resources preset (CPU+RAM configuration)
                                            for the compute nodes of the cluster.
        :type computenode_resource_preset: str
        :param computenode_disk_size: Computenodes storage size in GiB.
        :type computenode_disk_size: int
        :param computenode_disk_type: Computenodes storage type. Possible options: network-ssd, network-hdd.
        :type computenode_disk_type: str
        :param computenode_count: Number of compute nodes.
        :type computenode_count: int

        :return: Operation result. The created Cluster message is available in its
                 response field; the new cluster ID is also stored in self.cluster_id.
        """

        # pylint: disable=too-many-arguments
        # pylint: disable=too-many-locals

        folder_id = folder_id or self.default_folder_id
        if not folder_id:
            raise RuntimeError(
                'Folder ID must be specified to create cluster.')

        if not cluster_name:
            random_int = random.randint(0, 999)
            cluster_name = 'dataproc-{random_int}'.format(
                random_int=random_int)

        if not subnet_id:
            network_id = self.sdk.helpers.find_network_id(folder_id)
            subnet_id = self.sdk.helpers.find_subnet_id(
                folder_id, zone, network_id)

        if not service_account_id:
            service_account_id = self.sdk.helpers.find_service_account_id(
                folder_id)

        if not ssh_public_keys:
            if self.default_public_ssh_key:
                ssh_public_keys = (self.default_public_ssh_key, )
            else:
                raise RuntimeError('Public ssh keys must be specified.')
        elif isinstance(ssh_public_keys, string_types):
            ssh_public_keys = [ssh_public_keys]

        subclusters = [
            cluster_service_pb.CreateSubclusterConfigSpec(
                name='master',
                role=subcluster_pb.Role.MASTERNODE,
                resources=common_pb.Resources(
                    resource_preset_id=masternode_resource_preset,
                    disk_size=masternode_disk_size * (1024**3),
                    disk_type_id=masternode_disk_type,
                ),
                subnet_id=subnet_id,
                hosts_count=1,
            ),
            cluster_service_pb.CreateSubclusterConfigSpec(
                name='data',
                role=subcluster_pb.Role.DATANODE,
                resources=common_pb.Resources(
                    resource_preset_id=datanode_resource_preset,
                    disk_size=datanode_disk_size * (1024**3),
                    disk_type_id=datanode_disk_type,
                ),
                subnet_id=subnet_id,
                hosts_count=datanode_count,
            ),
        ]

        if computenode_count:
            subclusters.append(
                cluster_service_pb.CreateSubclusterConfigSpec(
                    name='compute',
                    role=subcluster_pb.Role.COMPUTENODE,
                    resources=common_pb.Resources(
                        resource_preset_id=computenode_resource_preset,
                        disk_size=computenode_disk_size * (1024**3),
                        disk_type_id=computenode_disk_type,
                    ),
                    subnet_id=subnet_id,
                    hosts_count=computenode_count,
                ))

        request = cluster_service_pb.CreateClusterRequest(
            folder_id=folder_id,
            name=cluster_name,
            description=cluster_description,
            config_spec=cluster_service_pb.CreateClusterConfigSpec(
                version_id=cluster_image_version,
                hadoop=cluster_pb.HadoopConfig(
                    services=services,
                    ssh_public_keys=ssh_public_keys,
                ),
                subclusters_spec=subclusters,
            ),
            zone_id=zone,
            service_account_id=service_account_id,
            bucket=s3_bucket,
        )
        result = self.sdk.create_operation_and_get_result(
            request,
            service=cluster_service_grpc_pb.ClusterServiceStub,
            method_name='Create',
            response_type=cluster_pb.Cluster,
            meta_type=cluster_service_pb.CreateClusterMetadata,
        )
        self.cluster_id = result.response.id
        self.subnet_id = subnet_id
        return result
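
A short usage sketch for this method, assuming dataproc is an instance of the (unshown) wrapper class, the bucket name is illustrative, and the folder, network and service account defaults can be resolved by the helpers above:

# Hypothetical call: only s3_bucket is required here; everything else falls back
# to the defaults declared in the signature or resolved via sdk.helpers.
operation = dataproc.create_cluster(
    s3_bucket='my-dataproc-logs-bucket',
    cluster_description='example cluster',
    datanode_count=2,
)
print('Created cluster', operation.response.id)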
Example #4
    def create_subcluster(
        self,
        subcluster_type,
        name,
        resource_preset='s2.small',
        disk_size=15,
        disk_type='network-ssd',
        hosts_count=5,
        subnet_id=None,
        cluster_id=None,
    ):
        """
        Create a subcluster in a Yandex.Cloud Data Proc cluster.

        :param name: Name of the subcluster. Must be unique in the cluster
        :type name: str
        :param subcluster_type: Type of the subcluster. Either "data" or "compute".
        :type subcluster_type: str
        :param resource_preset: Resources preset (CPU+RAM configuration) for the nodes of the cluster.
        :type resource_preset: str
        :param disk_size: Storage size in GiB.
        :type disk_size: int
        :param disk_type: Storage type. Possible options: network-ssd, network-hdd.
        :type disk_type: str
        :param hosts_count: Number of nodes in subcluster.
        :type hosts_count: int
        :param subnet_id: Subnet ID of the cluster.
        :type subnet_id: str
        :param cluster_id: ID of the cluster.
        :type cluster_id: str
        """
        cluster_id = cluster_id or self.cluster_id
        if not cluster_id:
            raise RuntimeError('Cluster id must be specified.')
        subnet_id = subnet_id or self.subnet_id
        if not subnet_id:
            raise RuntimeError('Subnet ID must be specified.')

        types = {
            'compute': subcluster_pb.Role.COMPUTENODE,
            'data': subcluster_pb.Role.DATANODE,
        }
        resources = common_pb.Resources(
            resource_preset_id=resource_preset,
            disk_size=disk_size * (1024**3),
            disk_type_id=disk_type,
        )

        self.log.info('Adding subcluster to cluster {cluster_id}'.format(
            cluster_id=cluster_id))
        request = subcluster_service_pb.CreateSubclusterRequest(
            cluster_id=cluster_id,
            name=name,
            role=types[subcluster_type],
            resources=resources,
            subnet_id=subnet_id,
            hosts_count=hosts_count,
        )
        return self.sdk.create_operation_and_get_result(
            request,
            service=subcluster_service_grpc_pb.SubclusterServiceStub,
            method_name='Create',
            response_type=subcluster_pb.Subcluster,
            meta_type=subcluster_service_pb.CreateSubclusterMetadata,
        )
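
Usage sketch, again assuming dataproc is an instance of the wrapper class and that a cluster was created earlier so that self.cluster_id and self.subnet_id are already set:

# Hypothetical call: adds a 3-host compute subcluster to the current cluster.
dataproc.create_subcluster(
    subcluster_type='compute',
    name='compute-extra',
    resource_preset='s2.small',
    disk_size=15,          # GiB; converted to bytes inside the method
    disk_type='network-ssd',
    hosts_count=3,
)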
Example #5
    def create_cluster(
        self,
        s3_bucket,
        folder_id=None,
        cluster_name=None,
        cluster_description="",
        cluster_image_version=None,
        ssh_public_keys=None,
        subnet_id=None,
        services=None,
        zone="ru-central1-b",
        service_account_id=None,
        masternode_resource_preset=None,
        masternode_disk_size=None,
        masternode_disk_type=None,
        datanode_resource_preset=None,
        datanode_disk_size=None,
        datanode_disk_type=None,
        datanode_count=None,
        computenode_resource_preset=None,
        computenode_disk_size=None,
        computenode_disk_type=None,
        computenode_count=None,
        computenode_max_hosts_count=None,
        computenode_measurement_duration=None,
        computenode_warmup_duration=None,
        computenode_stabilization_duration=None,
        computenode_preemptible=None,
        computenode_cpu_utilization_target=None,
        computenode_decommission_timeout=None,
        log_group_id=None,
    ):
        """
        Create Yandex.Cloud Data Proc cluster.

        :param s3_bucket: Yandex.Cloud S3 bucket to store cluster logs.
                          Jobs will not work if the bucket is not specified.
        :type s3_bucket: str
        :param folder_id: ID of the folder in which cluster should be created.
        :type folder_id: str
        :param cluster_name: Cluster name. Must be unique inside the folder.
        :type cluster_name: str
        :param cluster_description: Cluster description.
        :type cluster_description: str
        :param cluster_image_version: Cluster image version. Using the default is recommended.
        :type cluster_image_version: str
        :param ssh_public_keys: List of SSH public keys that will be deployed to created compute instances.
        :type ssh_public_keys: List[str]
        :param subnet_id: ID of the subnetwork. All Data Proc cluster nodes will use one subnetwork.
        :type subnet_id: str
        :param services: List of services that will be installed to the cluster. Possible options:
            HDFS, YARN, MAPREDUCE, HIVE, TEZ, ZOOKEEPER, HBASE, SQOOP, FLUME, SPARK, ZEPPELIN, OOZIE
        :type services: List[str]
        :param zone: Availability zone to create cluster in.
                     Currently there are ru-central1-a, ru-central1-b and ru-central1-c.
        :type zone: str
        :param service_account_id: Service account id for the cluster.
                                   Service account can be created inside the folder.
        :type service_account_id: str
        :param masternode_resource_preset: Resources preset (CPU+RAM configuration)
                                           for the master node of the cluster.
        :type masternode_resource_preset: str
        :param masternode_disk_size: Masternode storage size in GiB.
        :type masternode_disk_size: int
        :param masternode_disk_type: Masternode storage type. Possible options: network-ssd, network-hdd.
        :type masternode_disk_type: str
        :param datanode_resource_preset: Resources preset (CPU+RAM configuration)
                                         for the data nodes of the cluster.
        :type datanode_resource_preset: str
        :param datanode_disk_size: Datanodes storage size in GiB.
        :type datanode_disk_size: int
        :param datanode_disk_type: Datanodes storage type. Possible options: network-ssd, network-hdd.
        :type datanode_disk_type: str
        :param datanode_count: Number of data nodes.
        :type datanode_count: int
        :param computenode_resource_preset: Resources preset (CPU+RAM configuration)
                                            for the compute nodes of the cluster.
        :type computenode_resource_preset: str
        :param computenode_disk_size: Computenodes storage size in GiB.
        :type computenode_disk_size: int
        :param computenode_disk_type: Computenodes storage type. Possible options: network-ssd, network-hdd.
        :type computenode_disk_type: str
        :param computenode_count: Number of compute nodes.
        :type computenode_count: int
        :param computenode_max_hosts_count: Maximum number of hosts in the autoscaling compute subcluster.
        :type computenode_max_hosts_count: int
        :param computenode_measurement_duration: Time in seconds allotted for averaging metrics.
        :type computenode_measurement_duration: int
        :param computenode_warmup_duration: The warmup time of the instance in seconds. During this time,
                                traffic is sent to the instance, but instance metrics are not collected.
        :type computenode_warmup_duration: int
        :param computenode_stabilization_duration: Minimum amount of time in seconds allotted for monitoring before
                                       Instance Groups can reduce the number of instances in the group.
                                       During this time, the group size doesn't decrease, even if the new metric values
                                       indicate that it should.
        :type computenode_stabilization_duration: int
        :param computenode_preemptible: Preemptible instances are stopped at least once every 24 hours,
                            and can be stopped at any time if their resources are needed by Compute.
        :type computenode_preemptible: bool
        :param computenode_cpu_utilization_target: Defines an autoscaling rule
                                       based on the average CPU utilization of the instance group,
                                       in percent (10-100).
        :type computenode_cpu_utilization_target: int
        :param computenode_decommission_timeout: Timeout to gracefully decommission nodes during downscaling.
                                                 In seconds.
        :type computenode_decommission_timeout: int
        :param log_group_id: ID of the log group to write logs to. By default, logs are sent to the default log group.
                             To disable sending logs to the cloud, set the cluster property dataproc:disable_cloud_logging = true.
        :type log_group_id: str

        :return: Operation result. The created Cluster message is available in its
                 response field; the new cluster ID is also stored in self.cluster_id.
        """

        # pylint: disable=too-many-arguments
        # pylint: disable=too-many-locals
        # pylint: disable=too-many-branches

        folder_id = folder_id or self.default_folder_id
        if not folder_id:
            raise RuntimeError(
                "Folder ID must be specified to create cluster.")

        if not cluster_name:
            random_int = random.randint(0, 999)
            cluster_name = "dataproc-{random_int}".format(
                random_int=random_int)

        if not subnet_id:
            network_id = self.sdk.helpers.find_network_id(folder_id)
            subnet_id = self.sdk.helpers.find_subnet_id(
                folder_id, zone, network_id)

        if not service_account_id:
            service_account_id = self.sdk.helpers.find_service_account_id(
                folder_id)

        if not ssh_public_keys:
            if self.default_public_ssh_key:
                ssh_public_keys = (self.default_public_ssh_key, )
            else:
                raise RuntimeError("Public ssh keys must be specified.")
        elif isinstance(ssh_public_keys, string_types):
            ssh_public_keys = [ssh_public_keys]

        if not s3_bucket:
            raise RuntimeError("Object storage (S3) bucket must be specified.")

        gib = 1024**3
        if masternode_disk_size:
            masternode_disk_size *= gib
        if datanode_disk_size:
            datanode_disk_size *= gib
        if computenode_disk_size:
            computenode_disk_size *= gib
        subclusters = [
            cluster_service_pb.CreateSubclusterConfigSpec(
                name="master",
                role=subcluster_pb.Role.MASTERNODE,
                resources=common_pb.Resources(
                    resource_preset_id=masternode_resource_preset,
                    disk_size=masternode_disk_size,
                    disk_type_id=masternode_disk_type,
                ),
                subnet_id=subnet_id,
                hosts_count=1,
            ),
            cluster_service_pb.CreateSubclusterConfigSpec(
                name="data",
                role=subcluster_pb.Role.DATANODE,
                resources=common_pb.Resources(
                    resource_preset_id=datanode_resource_preset,
                    disk_size=datanode_disk_size,
                    disk_type_id=datanode_disk_type,
                ),
                subnet_id=subnet_id,
                hosts_count=datanode_count,
            ),
        ]

        if computenode_count:
            autoscaling_config = None
            if computenode_max_hosts_count:
                autoscaling_config = subcluster_pb.AutoscalingConfig(
                    max_hosts_count=computenode_max_hosts_count,
                    measurement_duration=computenode_measurement_duration,
                    warmup_duration=computenode_warmup_duration,
                    stabilization_duration=computenode_stabilization_duration,
                    preemptible=computenode_preemptible,
                    cpu_utilization_target=computenode_cpu_utilization_target,
                    decommission_timeout=computenode_decommission_timeout,
                )
            subclusters.append(
                cluster_service_pb.CreateSubclusterConfigSpec(
                    name="compute",
                    role=subcluster_pb.Role.COMPUTENODE,
                    resources=common_pb.Resources(
                        resource_preset_id=computenode_resource_preset,
                        disk_size=computenode_disk_size,
                        disk_type_id=computenode_disk_type,
                    ),
                    subnet_id=subnet_id,
                    hosts_count=computenode_count,
                    autoscaling_config=autoscaling_config,
                ))

        request = cluster_service_pb.CreateClusterRequest(
            folder_id=folder_id,
            name=cluster_name,
            description=cluster_description,
            config_spec=cluster_service_pb.CreateClusterConfigSpec(
                version_id=cluster_image_version,
                hadoop=cluster_pb.HadoopConfig(
                    services=services,
                    ssh_public_keys=ssh_public_keys,
                ),
                subclusters_spec=subclusters,
            ),
            zone_id=zone,
            service_account_id=service_account_id,
            bucket=s3_bucket,
            log_group_id=log_group_id,
        )
        result = self.sdk.create_operation_and_get_result(
            request,
            service=cluster_service_grpc_pb.ClusterServiceStub,
            method_name="Create",
            response_type=cluster_pb.Cluster,
            meta_type=cluster_service_pb.CreateClusterMetadata,
        )
        self.cluster_id = result.response.id
        self.subnet_id = subnet_id  # pylint: disable=attribute-defined-outside-init
        return result
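
Usage sketch for this autoscaling-aware variant; the bucket name and sizing values are illustrative. Because both computenode_count and computenode_max_hosts_count are given, the compute subcluster is created with an AutoscalingConfig:

# Hypothetical call: fixed master and data subclusters plus an autoscaling
# compute subcluster that can grow from 1 to 5 hosts.
operation = dataproc.create_cluster(
    s3_bucket="my-dataproc-logs-bucket",
    services=("HDFS", "YARN", "SPARK"),
    datanode_count=2,
    computenode_count=1,
    computenode_max_hosts_count=5,
    computenode_preemptible=True,
    computenode_cpu_utilization_target=75,
)
print("Created cluster", operation.response.id)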
Example #6
    def create_subcluster(
        self,
        subcluster_type,
        name,
        resource_preset=None,
        disk_size=None,
        disk_type=None,
        hosts_count=None,
        subnet_id=None,
        cluster_id=None,
        max_hosts_count=None,
        measurement_duration=None,
        warmup_duration=None,
        stabilization_duration=None,
        preemptible=None,
        cpu_utilization_target=None,
        decommission_timeout=None,
    ):
        """
        Create a subcluster in a Yandex.Cloud Data Proc cluster.

        :param name: Name of the subcluster. Must be unique in the cluster
        :type name: str
        :param subcluster_type: Type of the subcluster. Either "data" or "compute".
        :type subcluster_type: str
        :param resource_preset: Resources preset (CPU+RAM configuration) for the nodes of the cluster.
        :type resource_preset: str
        :param disk_size: Storage size in GiB.
        :type disk_size: int
        :param disk_type: Storage type. Possible options: network-ssd, network-hdd.
        :type disk_type: str
        :param hosts_count: Number of nodes in subcluster.
        :type hosts_count: int
        :param subnet_id: Subnet ID of the cluster.
        :type subnet_id: str
        :param cluster_id: ID of the cluster.
        :type cluster_id: str
        :param max_hosts_count: Maximum number of hosts in the autoscaling compute subcluster.
        :type max_hosts_count: int
        :param measurement_duration: Time in seconds allotted for averaging metrics.
        :type measurement_duration: int
        :param warmup_duration: The warmup time of the instance in seconds. During this time,
                                traffic is sent to the instance, but instance metrics are not collected.
        :type warmup_duration: int
        :param stabilization_duration: Minimum amount of time in seconds allotted for monitoring before
                                       Instance Groups can reduce the number of instances in the group.
                                       During this time, the group size doesn't decrease, even if the new metric values
                                       indicate that it should.
        :type stabilization_duration: int
        :param preemptible: Preemptible instances are stopped at least once every 24 hours,
                            and can be stopped at any time if their resources are needed by Compute.
        :type preemptible: bool
        :param cpu_utilization_target: Defines an autoscaling rule
                                       based on the average CPU utilization of the instance group,
                                       in percent (10-100).
        :type cpu_utilization_target: int
        :param decommission_timeout: Timeout to gracefully decommission nodes during downscaling. In seconds.
        :type decommission_timeout: int
        """
        # pylint: disable=too-many-locals
        cluster_id = cluster_id or self.cluster_id
        if not cluster_id:
            raise RuntimeError("Cluster id must be specified.")
        subnet_id = subnet_id or self.subnet_id
        if not subnet_id:
            raise RuntimeError("Subnet ID must be specified.")

        types = {
            "compute": subcluster_pb.Role.COMPUTENODE,
            "data": subcluster_pb.Role.DATANODE,
        }
        if disk_size:
            disk_size *= 1024**3
        resources = common_pb.Resources(
            resource_preset_id=resource_preset,
            disk_size=disk_size,
            disk_type_id=disk_type,
        )

        self.log.info("Adding subcluster to cluster {cluster_id}".format(
            cluster_id=cluster_id))
        autoscaling_config = None
        if max_hosts_count:
            autoscaling_config = subcluster_pb.AutoscalingConfig(
                max_hosts_count=max_hosts_count,
                measurement_duration=measurement_duration,
                warmup_duration=warmup_duration,
                stabilization_duration=stabilization_duration,
                preemptible=preemptible,
                cpu_utilization_target=cpu_utilization_target,
                decommission_timeout=decommission_timeout,
            )
        request = subcluster_service_pb.CreateSubclusterRequest(
            cluster_id=cluster_id,
            name=name,
            role=types[subcluster_type],
            resources=resources,
            subnet_id=subnet_id,
            hosts_count=hosts_count,
            autoscaling_config=autoscaling_config,
        )
        return self.sdk.create_operation_and_get_result(
            request,
            service=subcluster_service_grpc_pb.SubclusterServiceStub,
            method_name="Create",
            response_type=subcluster_pb.Subcluster,
            meta_type=subcluster_service_pb.CreateSubclusterMetadata,
        )
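
And a matching sketch for adding an autoscaling compute subcluster directly, again with illustrative values; passing max_hosts_count triggers the AutoscalingConfig branch above:

# Hypothetical call: a compute subcluster scaling between 1 and 4 hosts
# based on average CPU utilization.
dataproc.create_subcluster(
    subcluster_type="compute",
    name="compute-autoscaling",
    resource_preset="s2.small",
    disk_size=15,
    disk_type="network-ssd",
    hosts_count=1,
    max_hosts_count=4,
    cpu_utilization_target=70,
    preemptible=True,
)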