def submit_classify_job(job_config, cluster_id, dry_run, **kwargs):
    """Submit the classify step to an EMR cluster, or print it on a dry run.

    Reads the job configuration, resolves the target cluster, optionally
    uploads the classify script to S3, stores the Spark executor settings in
    the config and builds the step definition with ``build_command``.  On a
    real run, existing output is removed (after interactive confirmation)
    before the step is added to the cluster; on a dry run the step definition
    is only printed.

    The function returns early, without building or submitting anything,
    when the resolved cluster id is an empty string or when
    ``check_configuration`` rejects the loaded config.

    Args:
        job_config: Path to the job configuration file.  ``None`` or a blank
            string selects ``config/classify_job.config``.
        cluster_id: Id of the EMR cluster.  ``None`` or a blank string
            defers to ``utility.get_cluster_id(dry_run)``.
        dry_run: When true, print the step arguments instead of submitting.
        **kwargs: Accepted and ignored.
            NOTE(review): presumably lets callers pass a whole parsed
            argument dict -- confirm against the call sites.

    Raises:
        SystemExit: Status 0 if the user declines removal of the existing
            output, status 1 if ``utility.remove_s3_files`` reports failure.
    """
    config_path = "config/classify_job.config"
    if job_config is not None and job_config.strip() != "":
        config_path = job_config.strip()

    config = configparser.ConfigParser()
    # Keep option names case-sensitive; ConfigParser lower-cases them by default.
    config.optionxform = str
    config.read(config_path)

    if cluster_id is None or cluster_id.strip() == "":
        cluster_id = utility.get_cluster_id(dry_run)
    else:
        cluster_id = cluster_id.strip()

    # Guard clause: nothing to do without a cluster or with an invalid config.
    if cluster_id == "" or not check_configuration(config):
        return

    job_settings = config["job_config"]
    # The flag is compared as the literal string "True", not parsed as a bool.
    if job_settings.get("upload_classify_script", "False") == "True":
        utility.upload_files_to_s3(
            [(job_settings["classify_script"],
              job_settings["classify_script_local_location"],
              job_settings["classify_script_s3_location"])],
            dry_run)

    spark_settings = config["spark_config"]
    num_executors = calculate_num_executor(
        cluster_id, spark_settings["executor_memory"])
    # A negative count is recorded as the string "None" rather than a number.
    spark_settings["num_executors"] = (
        "None" if num_executors < 0 else str(num_executors))
    spark_settings["executor_cores"] = "1"

    job_argument = build_command(cluster_id, config, num_executors)

    if dry_run:
        print(job_argument)
        return

    # The client is created before the prompt so that a failure to construct
    # it surfaces before any existing output has been deleted.
    emr_client = boto3.client("emr")
    out = config["script_arguments"]["output_location"]
    _confirm_and_remove_outputs(utility.check_s3_path_exists([out]))

    job_submission = emr_client.add_job_flow_steps(**job_argument)
    print("Submitted job to cluster {}. Job id is {}".format(
        cluster_id, job_submission["StepIds"][0]))


def _confirm_and_remove_outputs(dirs_to_remove):
    """Ask the user to confirm, then delete, the given existing S3 outputs.

    Does nothing when ``dirs_to_remove`` is empty.  Exits the program with
    status 0 if the user answers "n" and with status 1 if
    ``utility.remove_s3_files`` returns a falsy result.
    """
    if not dirs_to_remove:
        return

    prompt = ("About to remove any existing output directories."
              "\n\n\t{}\n\nProceed? [y/n]: ".format(
                  '\n\n\t'.join(dirs_to_remove)))
    # Normalise the answer so "Y", " y " etc. are accepted instead of
    # triggering another prompt.
    response = input(prompt).strip().lower()
    while response not in ("y", "n"):
        response = input('Proceed? [y/n]: ').strip().lower()

    if response == "n":
        print("Program Terminated. Modify config file to change " +
              "output directories.")
        sys.exit(0)

    if not utility.remove_s3_files(dirs_to_remove):
        print("Program terminated")
        sys.exit(1)
if cluster_id != "" and check_configuration(config): if config["job_config"].get("upload_downloader_script", "False") == "True": utility.upload_files_to_s3([(config["job_config"]["downloader_script"], config["job_config"]["downloader_script_local_location"], config["job_config"]["downloader_script_s3_location"])], parser_result.dry_run) job_argument = build_command(config) if not parser_result.dry_run: emr_client = boto3.client("emr") # warn user before removing any output out = config["script_arguments"]["output_location"] rep = config["script_arguments"]["report_location"] # find out which output dirs, if any, exist dirs_to_remove = utility.check_s3_path_exists([out, rep]) if dirs_to_remove: response = input("About to remove any existing output directories." + "\n\n\t{}\n\nProceed? [y/n]: ".format( '\n\n\t'.join(dirs_to_remove))) while response not in ['y', 'n']: response = input('Proceed? [y/n]: ') if response == 'n': print("Program Terminated. Modify config file to change " + "output directories.") sys.exit(0) # remove the output directories if not utility.remove_s3_files(dirs_to_remove): print("Program terminated") sys.exit(1) job_submission = emr_client.add_job_flow_steps(**job_argument)
num_executors = calculate_num_executor(cluster_id, config["spark_config"]["executor_memory"]) if num_executors < 0: config["spark_config"]["num_executors"] = "None" else: config["spark_config"]["num_executors"] = str(num_executors) config["spark_config"]["executor_cores"] = "1" job_argument = build_command(config) if not parser_result.dry_run: emr_client = boto3.client("emr") # warn user before removing any output out = config["script_arguments"]["output_location"] # find out which output dirs, if any, exist dirs_to_remove = utility.check_s3_path_exists([out]) # create a list of the names of the directories to remove if dirs_to_remove: response = input("About to remove any existing output directories." + "\n\n\t{}\n\nProceed? [y/n]: ".format( '\n\n\t'.join(dirs_to_remove))) while response not in ['y', 'n']: response = input('Proceed? [y/n]: ') if response == 'n': print("Program Terminated. Modify config file to change " + "output directories.") sys.exit(0) # remove the output directories if not utility.remove_s3_files(dirs_to_remove): print("Program terminated") sys.exit(1)