def execute(args: typing.NamedTuple):
    """Stop a running application on an AZTK Spark job.

    Reads ``args.job_id`` and ``args.app_name`` and asks the Spark client to
    stop the named app; logs the outcome either way.
    """
    spark_client = aztk.spark.Client(config.load_aztk_screts())

    if spark_client.stop_job_app(args.job_id, args.app_name):
        log.info("Stopped app {0}".format(args.app_name))
    else:
        # Bug fix: the original message contained a {0} placeholder but never
        # called .format(), so the literal "{0}" was printed to the user.
        log.error("App with name {0} does not exist or was already deleted".format(args.app_name))
def add_file(self, p):
    """Add the specified file to the import queue.

    Validates that *p* exists, is a regular file, lives under the document
    root, and has a supported extension, then parses it and appends the
    parser to ``self.import_queue``.

    Raises:
        IOError: if *p* does not exist or is not a file.
        ValueError: if *p* is not a child of ``self.rootpath``.
    """
    log = self._app.log
    # Sanity checks
    if not p.exists():
        raise IOError((errno.ENOENT, "Cannot add file: does not exist.", p))
    if not p.isfile():
        raise IOError((errno.ENOTDIR, "Cannot add file: not a file.", p))
    if self.rootpath.relpathto(p).startswith(".."):
        raise ValueError("The supplied file is not a child of the document root.")
    if p.ext not in FILETYPES:
        log.debug("Skipping file %s. Filetype not supported." % (p))
        return
    if self.has_been_imported(p):
        # TODO check datestamp to see if it needs to be updated
        # Bug fix: the original log call had a %s placeholder but no argument,
        # so the literal "%s" was logged instead of the file path.
        log.debug("Skipping file %s. Already processed." % (p))
        return
    log.info("Queueing file for import: '%s'" % (p))
    parser = DocumentFileParser(p, self.rootpath, log)
    parser.parse()
    self.import_queue.append(parser)
def execute(args: typing.NamedTuple):
    """Create a Spark cluster from cluster.yaml merged with CLI arguments.

    Precedence (lowest to highest): defaults -> cluster.yaml -> CLI args.
    Prompts for SSH key/password when a username is configured, then creates
    the cluster, optionally waiting for provisioning to finish.
    """
    spark_client = aztk.spark.Client(config.load_aztk_screts())
    cluster_conf = ClusterConfiguration()
    cluster_conf.spark_configuration = load_aztk_spark_config()

    # Read the cluster.yaml configuration file, then overwrite its values
    # with whatever was passed explicitly on the command line.
    file_config, wait = config.read_cluster_config()
    cluster_conf.merge(file_config)
    cluster_conf.merge(
        ClusterConfiguration(cluster_id=args.cluster_id,
                             vm_count=args.size,
                             vm_low_pri_count=args.size_low_pri,
                             vm_size=args.vm_size,
                             subnet_id=args.subnet_id,
                             user_configuration=UserConfiguration(
                                 username=args.username,
                                 password=args.password,
                             ),
                             docker_repo=args.docker_repo))

    # CLI --wait/--no-wait overrides the value from cluster.yaml when given.
    wait = wait if args.wait is None else args.wait

    user_configuration = cluster_conf.user_configuration
    if user_configuration and user_configuration.username:
        # Prompt for (or load) an SSH key / password for the cluster user.
        ssh_key, password = utils.get_ssh_key_or_prompt(
            spark_client.secrets_config.ssh_pub_key,
            user_configuration.username, user_configuration.password,
            spark_client.secrets_config)
        cluster_conf.user_configuration = aztk.spark.models.UserConfiguration(
            username=user_configuration.username,
            password=password,
            ssh_key=ssh_key)
    else:
        # No username configured anywhere: create the cluster without a user.
        cluster_conf.user_configuration = None

    print_cluster_conf(cluster_conf, wait)

    spinner = utils.Spinner()
    spinner.start()
    # Create the spark cluster (blocks until provisioned when wait is True).
    cluster = spark_client.create_cluster(cluster_conf, wait=wait)
    spinner.stop()

    if wait:
        log.info("Cluster %s created successfully.", cluster.id)
    else:
        log.info("Cluster %s is being provisioned.", cluster.id)
def execute(args: typing.NamedTuple):
    """Delete a Spark cluster, asking for confirmation unless --force is set."""
    spark_client = aztk.spark.Client(config.load_aztk_screts())
    target_id = args.cluster_id

    # Unless forced, require the user to re-type the cluster id exactly.
    if not args.force:
        typed_id = input(
            "Please confirm the id of the cluster you wish to delete: ")
        if typed_id != target_id:
            log.error(
                "Confirmation cluster id does not match. Please try again.")
            return

    deleted = spark_client.delete_cluster(target_id)
    if deleted:
        log.info("Deleting cluster %s", target_id)
    else:
        log.error("Cluster with id '%s' doesn't exist or was already deleted.",
                  target_id)
def execute(args: typing.NamedTuple):
    """Delete a Spark job, asking for confirmation unless --force is set.

    Verifies the job exists before prompting so the user is not asked to
    confirm the deletion of a nonexistent job.
    """
    spark_client = aztk.spark.Client(config.load_aztk_screts())
    job_id = args.job_id

    if not args.force:
        # Check if the job exists before prompting for confirmation.
        spark_client.get_job(job_id)
        # Bug fix: the prompt, error message, and local variable were
        # copy-pasted from the cluster-delete command and said "cluster",
        # which is misleading for a job deletion.
        confirmation_job_id = input(
            "Please confirm the id of the job you wish to delete: ")
        if confirmation_job_id != job_id:
            log.error("Confirmation job id does not match. Please try again.")
            return

    if spark_client.delete_job(job_id):
        log.info("Deleting Job %s", job_id)
    else:
        log.error("Job with id '%s' doesn't exist or was already deleted.",
                  job_id)
def execute(args: typing.NamedTuple):
    """Submit a Spark application to a cluster, logging the submission details."""
    spark_client = load_spark_client()

    def _csv_list(raw):
        # Comma-separated CLI value -> list of tokens (spaces stripped).
        return raw.replace(' ', '').split(',') if raw is not None else []

    jars = _csv_list(args.jars)
    py_files = _csv_list(args.py_files)
    files = _csv_list(args.files)

    log.info("-------------------------------------------")
    log.info("Spark cluster id: %s", args.cluster_id)
    log.info("Spark app name: %s", args.name)
    log.info("Wait for app completion: %s", args.wait)
    if args.main_class is not None:
        log.info("Entry point class: %s", args.main_class)
    # File lists are only logged when non-empty.
    for label, values in (("JARS: %s", jars),
                          ("PY_Files: %s", py_files),
                          ("Files: %s", files)):
        if values:
            log.info(label, values)
    # Optional driver/executor settings are logged only when provided.
    for label, value in (
            ("Driver java options: %s", args.driver_java_options),
            ("Driver library path: %s", args.driver_library_path),
            ("Driver class path: %s", args.driver_class_path),
            ("Driver memory: %s", args.driver_memory),
            ("Executor memory: %s", args.executor_memory),
            ("Driver cores: %s", args.driver_cores),
            ("Executor cores: %s", args.executor_cores)):
        if value is not None:
            log.info(label, value)
    log.info("Application: %s", args.app)
    log.info("Application arguments: %s", args.app_args)
    log.info("-------------------------------------------")

    # Submit without waiting; log streaming below provides the "wait" behavior.
    spark_client.submit(cluster_id=args.cluster_id,
                        application=aztk.spark.models.Application(
                            name=args.name,
                            application=args.app,
                            application_args=args.app_args,
                            main_class=args.main_class,
                            jars=jars,
                            py_files=py_files,
                            files=files,
                            driver_java_options=args.driver_java_options,
                            driver_library_path=args.driver_library_path,
                            driver_class_path=args.driver_class_path,
                            driver_memory=args.driver_memory,
                            executor_memory=args.executor_memory,
                            driver_cores=args.driver_cores,
                            executor_cores=args.executor_cores,
                            max_retry_count=args.max_retry_count),
                        wait=False)

    if args.wait:
        utils.stream_logs(client=spark_client,
                          cluster_id=args.cluster_id,
                          application_name=args.name)
def print_cluster_conf(cluster_conf: ClusterConfiguration, wait: bool):
    """Log a human-readable summary of the cluster configuration.

    Args:
        cluster_conf: the merged configuration about to be provisioned.
        wait: whether cluster creation will block until provisioned.
    """
    user_configuration = cluster_conf.user_configuration
    log.info("-------------------------------------------")
    log.info("spark cluster id: %s", cluster_conf.cluster_id)
    log.info("spark cluster size: %s",
             cluster_conf.vm_count + cluster_conf.vm_low_pri_count)
    log.info("> dedicated: %s", cluster_conf.vm_count)
    log.info("> low priority: %s", cluster_conf.vm_low_pri_count)
    log.info("spark cluster vm size: %s", cluster_conf.vm_size)
    log.info(
        "custom scripts: %s",
        len(cluster_conf.custom_scripts) if cluster_conf.custom_scripts else 0)
    log.info("subnet ID: %s", cluster_conf.subnet_id)
    log.info(
        "file shares: %s",
        len(cluster_conf.file_shares)
        if cluster_conf.file_shares is not None else 0)
    log.info("docker repo name: %s", cluster_conf.docker_repo)
    log.info("wait for cluster: %s", wait)
    # Bug fix: user_configuration may be None (the create-cluster command sets
    # it to None when no username is configured), so dereferencing .username
    # unconditionally raised AttributeError.
    if user_configuration:
        log.info("username: %s", user_configuration.username)
        if user_configuration.password:
            log.info("Password: %s", '*' * len(user_configuration.password))
    log.info("-------------------------------------------")
def execute(args: typing.NamedTuple):
    """Create a Spark cluster from a legacy ClusterConfig merged with CLI args.

    Translates the dict-based ClusterConfig entries (custom scripts, file
    shares) into aztk model objects, prompts for SSH credentials when a
    username is set, then provisions the cluster.
    """
    spark_client = aztk.spark.Client(config.load_aztk_screts())

    # Read the cluster.yaml configuration file, overwrite values with args.
    cluster_conf = ClusterConfig()
    cluster_conf.merge(uid=args.cluster_id,
                       size=args.size,
                       size_low_pri=args.size_low_pri,
                       vm_size=args.vm_size,
                       subnet_id=args.subnet_id,
                       wait=args.wait,
                       username=args.username,
                       password=args.password,
                       docker_repo=args.docker_repo)

    # Convert raw dict entries from the YAML config into aztk model objects;
    # None (not an empty list) signals "nothing configured".
    if cluster_conf.custom_scripts:
        custom_scripts = []
        for custom_script in cluster_conf.custom_scripts:
            custom_scripts.append(
                aztk.spark.models.CustomScript(script=custom_script['script'],
                                               run_on=custom_script['runOn']))
    else:
        custom_scripts = None

    if cluster_conf.file_shares:
        file_shares = []
        for file_share in cluster_conf.file_shares:
            file_shares.append(
                aztk.spark.models.FileShare(
                    storage_account_name=file_share['storage_account_name'],
                    storage_account_key=file_share['storage_account_key'],
                    file_share_path=file_share['file_share_path'],
                    mount_path=file_share['mount_path']))
    else:
        file_shares = None

    if cluster_conf.username:
        # Prompt for (or load) SSH credentials for the cluster user.
        ssh_key, password = utils.get_ssh_key_or_prompt(
            spark_client.secrets_config.ssh_pub_key, cluster_conf.username,
            cluster_conf.password, spark_client.secrets_config)
        user_conf = aztk.spark.models.UserConfiguration(
            username=cluster_conf.username, password=password,
            ssh_key=ssh_key)
    else:
        user_conf = None

    print_cluster_conf(cluster_conf)

    spinner = utils.Spinner()
    spinner.start()
    # Create the spark cluster (blocks until provisioned when wait is True).
    cluster = spark_client.create_cluster(
        aztk.spark.models.ClusterConfiguration(
            cluster_id=cluster_conf.uid,
            vm_count=cluster_conf.size,
            vm_low_pri_count=cluster_conf.size_low_pri,
            vm_size=cluster_conf.vm_size,
            subnet_id=cluster_conf.subnet_id,
            custom_scripts=custom_scripts,
            file_shares=file_shares,
            docker_repo=cluster_conf.docker_repo,
            spark_configuration=load_aztk_spark_config(),
            user_configuration=user_conf),
        wait=cluster_conf.wait)
    spinner.stop()

    if cluster_conf.wait:
        log.info("Cluster %s created successfully.", cluster.id)
    else:
        log.info("Cluster %s is being provisioned.", cluster.id)
def print_cluster_conf(cluster_conf):
    """Log a human-readable summary of a legacy ClusterConfig."""
    total_nodes = cluster_conf.size + cluster_conf.size_low_pri
    shares = cluster_conf.file_shares
    share_count = len(shares) if shares is not None else 0

    log.info("-------------------------------------------")
    log.info("spark cluster id: %s", cluster_conf.uid)
    log.info("spark cluster size: %s", total_nodes)
    log.info("> dedicated: %s", cluster_conf.size)
    log.info("> low priority: %s", cluster_conf.size_low_pri)
    log.info("spark cluster vm size: %s", cluster_conf.vm_size)
    log.info("custom scripts: %s", cluster_conf.custom_scripts)
    log.info("subnet ID: %s", cluster_conf.subnet_id)
    log.info("file shares: %s", share_count)
    log.info("docker repo name: %s", cluster_conf.docker_repo)
    log.info("wait for cluster: %s", cluster_conf.wait)
    log.info("username: %s", cluster_conf.username)
    if cluster_conf.password:
        # Mask the password, but show its length.
        log.info("Password: %s", '*' * len(cluster_conf.password))
    log.info("-------------------------------------------")
def execute(args: typing.NamedTuple):
    """Create a user on an existing Spark cluster."""
    spark_client = aztk.spark.Client(config.load_aztk_screts())

    log.info('-------------------------------------------')
    log.info('spark cluster id: {}'.format(args.cluster_id))
    log.info('username: {}'.format(args.username))
    log.info('-------------------------------------------')

    # Prefer an explicitly supplied key; fall back to the secrets config.
    candidate_key = args.ssh_key if args.ssh_key else spark_client.secrets_config.ssh_pub_key
    ssh_key, password = utils.get_ssh_key_or_prompt(
        candidate_key, args.username, args.password,
        spark_client.secrets_config)

    spark_client.create_user(cluster_id=args.cluster_id,
                             username=args.username,
                             password=password,
                             ssh_key=ssh_key)

    # Echo which credential was used, masking the password.
    if password:
        log.info('password: %s', '*' * len(password))
    elif ssh_key:
        log.info('ssh public key: %s', ssh_key)
    log.info('-------------------------------------------')
def execute(args: typing.NamedTuple):
    """SSH into a cluster's master node with port forwarding for the web UIs.

    Merges ssh.yaml configuration with CLI arguments, logs the local forwarded
    ports, and either prints the ssh command or connects directly.

    Raises:
        aztk.error.AztkError: if the target cluster (Batch pool) does not exist.
    """
    spark_client = aztk.spark.Client(config.load_aztk_screts())
    ssh_conf = SshConfig()

    # Merge ssh.yaml configuration with command-line arguments.
    ssh_conf.merge(cluster_id=args.cluster_id,
                   username=args.username,
                   job_ui_port=args.jobui,
                   job_history_ui_port=args.jobhistoryui,
                   web_ui_port=args.webui,
                   jupyter_port=args.jupyter,
                   name_node_ui_port=args.namenodeui,
                   rstudio_server_port=args.rstudioserver,
                   host=args.host,
                   connect=args.connect)

    http_prefix = 'http://localhost:'
    log.info("-------------------------------------------")
    log.info("spark cluster id: %s", ssh_conf.cluster_id)
    log.info("open webui: %s%s", http_prefix, ssh_conf.web_ui_port)
    log.info("open jobui: %s%s", http_prefix, ssh_conf.job_ui_port)
    log.info("open jobhistoryui: %s%s", http_prefix,
             ssh_conf.job_history_ui_port)
    log.info("open jupyter: %s%s", http_prefix, ssh_conf.jupyter_port)
    log.info("open namenodeui: %s%s", http_prefix, ssh_conf.name_node_ui_port)
    log.info("open rstudio server: %s%s", http_prefix,
             ssh_conf.rstudio_server_port)
    log.info("ssh username: %s", ssh_conf.username)
    log.info("connect: %s", ssh_conf.connect)
    log.info("-------------------------------------------")

    # Build the ssh command (and connect, when ssh_conf.connect is set).
    try:
        ssh_cmd = utils.ssh_in_master(
            client=spark_client,
            cluster_id=ssh_conf.cluster_id,
            webui=ssh_conf.web_ui_port,
            jobui=ssh_conf.job_ui_port,
            jobhistoryui=ssh_conf.job_history_ui_port,
            namenodeui=ssh_conf.name_node_ui_port,
            jupyter=ssh_conf.jupyter_port,
            rstudioserver=ssh_conf.rstudio_server_port,
            username=ssh_conf.username,
            host=ssh_conf.host,
            connect=ssh_conf.connect)
        if not ssh_conf.connect:
            # Not connecting directly: print the command for the user to run.
            log.info("")
            log.info(
                "Use the following command to connect to your spark head node:"
            )
            log.info("\t%s", ssh_cmd)
    except batch_error.BatchErrorException as e:
        # Translate the Batch "PoolNotFound" error into a friendlier message;
        # re-raise anything else unchanged.
        if e.error.code == "PoolNotFound":
            raise aztk.error.AztkError(
                "The cluster you are trying to connect to does not exist.")
        else:
            raise
def execute(args: typing.NamedTuple):
    """Create a Spark cluster (legacy path with inline SparkConfiguration).

    Builds the SparkConfiguration from the default conf/jars source paths,
    provisions the cluster, and then creates the cluster user (if a username
    is configured) after the create call returns.
    """
    spark_client = load_spark_client()

    # Read the cluster.yaml configuration file, overwrite values with args.
    cluster_conf = ClusterConfig()
    cluster_conf.merge(uid=args.cluster_id,
                       size=args.size,
                       size_low_pri=args.size_low_pri,
                       vm_size=args.vm_size,
                       wait=args.wait,
                       username=args.username,
                       password=args.password,
                       docker_repo=args.docker_repo)
    print_cluster_conf(cluster_conf)

    # Convert raw dict entries from the YAML config into aztk model objects;
    # None (not an empty list) signals "nothing configured".
    if cluster_conf.custom_scripts:
        custom_scripts = []
        for custom_script in cluster_conf.custom_scripts:
            custom_scripts.append(
                aztk.spark.models.CustomScript(script=custom_script['script'],
                                               run_on=custom_script['runOn']))
    else:
        custom_scripts = None

    if cluster_conf.file_shares:
        file_shares = []
        for file_share in cluster_conf.file_shares:
            file_shares.append(
                aztk.spark.models.FileShare(
                    storage_account_name=file_share['storage_account_name'],
                    storage_account_key=file_share['storage_account_key'],
                    file_share_path=file_share['file_share_path'],
                    mount_path=file_share['mount_path']))
    else:
        file_shares = None

    jars_src = aztk.utils.constants.DEFAULT_SPARK_JARS_SOURCE

    # Create the spark cluster; every jar in the default jars directory is
    # shipped with the cluster.
    cluster = spark_client.create_cluster(
        aztk.spark.models.ClusterConfiguration(
            cluster_id=cluster_conf.uid,
            vm_count=cluster_conf.size,
            vm_low_pri_count=cluster_conf.size_low_pri,
            vm_size=cluster_conf.vm_size,
            custom_scripts=custom_scripts,
            file_shares=file_shares,
            docker_repo=cluster_conf.docker_repo,
            spark_configuration=aztk.spark.models.SparkConfiguration(
                spark_defaults_conf=os.path.join(
                    aztk.utils.constants.DEFAULT_SPARK_CONF_SOURCE,
                    'spark-defaults.conf'),
                spark_env_sh=os.path.join(
                    aztk.utils.constants.DEFAULT_SPARK_CONF_SOURCE,
                    'spark-env.sh'),
                core_site_xml=os.path.join(
                    aztk.utils.constants.DEFAULT_SPARK_CONF_SOURCE,
                    'core-site.xml'),
                jars=[
                    os.path.join(jars_src, path)
                    for path in os.listdir(jars_src)
                ])),
        wait=cluster_conf.wait)

    # NOTE(review): the user is created after create_cluster returns; when
    # wait is False this presumably relies on the service accepting user
    # creation on a still-provisioning cluster — confirm against the client.
    if cluster_conf.username:
        ssh_key = spark_client.secrets_config.ssh_pub_key
        ssh_key, password = utils.get_ssh_key_or_prompt(
            ssh_key, cluster_conf.username, cluster_conf.password,
            spark_client.secrets_config)
        spark_client.create_user(cluster_id=cluster_conf.uid,
                                 username=cluster_conf.username,
                                 password=password,
                                 ssh_key=ssh_key)

    if cluster_conf.wait:
        log.info("Cluster %s created successfully.", cluster.id)
    else:
        log.info("Cluster %s is being provisioned.", cluster.id)