def submit_classify_job(job_config, cluster_id, dry_run, **kwargs):
    # fall back to the default job configuration if none is supplied
    job_configuration = "config/classify_job.config"
    if job_config is not None and job_config.strip() != "":
        job_configuration = job_config.strip()

    config = configparser.ConfigParser()
    config.optionxform = str  # preserve the case of option names
    config.read(job_configuration)

    # resolve the target cluster, asking the utility code if none is given
    if cluster_id is None or cluster_id.strip() == "":
        cluster_id = utility.get_cluster_id(dry_run)
    else:
        cluster_id = cluster_id.strip()

    if cluster_id != "" and check_configuration(config):
        if config["job_config"].get("upload_classify_script", "False") == "True":
            utility.upload_files_to_s3(
                [(config["job_config"]["classify_script"],
                  config["job_config"]["classify_script_local_location"],
                  config["job_config"]["classify_script_s3_location"])],
                dry_run)

        # size the Spark job; a negative executor count means "let Spark decide"
        num_executors = calculate_num_executor(
            cluster_id, config["spark_config"]["executor_memory"])
        if num_executors < 0:
            config["spark_config"]["num_executors"] = "None"
        else:
            config["spark_config"]["num_executors"] = str(num_executors)
        config["spark_config"]["executor_cores"] = "1"

        job_argument = build_command(cluster_id, config, num_executors)

        if not dry_run:
            emr_client = boto3.client("emr")

            # warn the user before removing any existing output
            out = config["script_arguments"]["output_location"]
            # find out which output dirs, if any, exist
            dirs_to_remove = utility.check_s3_path_exists([out])

            if dirs_to_remove:
                response = input("About to remove any existing output directories." +
                                 "\n\n\t{}\n\nProceed? [y/n]: ".format(
                                     '\n\n\t'.join(dirs_to_remove)))
                while response not in ['y', 'n']:
                    response = input('Proceed? [y/n]: ')
                if response == 'n':
                    print("Program Terminated. Modify config file to change " +
                          "output directories.")
                    sys.exit(0)
                # remove the output directories
                if not utility.remove_s3_files(dirs_to_remove):
                    print("Program terminated")
                    sys.exit(1)

            job_submission = emr_client.add_job_flow_steps(**job_argument)
            print("Submitted job to cluster {}. Job id is {}".format(
                cluster_id, job_submission["StepIds"][0]))
        else:
            print(job_argument)
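# A minimal sketch of the INI file submit_classify_job reads, covering only the
# keys referenced above (check_configuration may require more); every value is
# a placeholder assumption, not the project's actual defaults.
#
#   [job_config]
#   upload_classify_script = True
#   classify_script = classify.py
#   classify_script_local_location = scripts/
#   classify_script_s3_location = s3://my-bucket/scripts/
#
#   [spark_config]
#   executor_memory = 4g
#
#   [script_arguments]
#   output_location = s3://my-bucket/output/classify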
def upload_files_to_s3(cfg, dry_run):
    """
    Uploads files to AWS S3 storage and updates the configuration object
    with the details of the S3 files.

    :param cfg: ConfigParser configuration object
    :param dry_run: flag to indicate whether this is a dry run
    :return: the updated configuration object
    """
    s3_upload_list = []

    section = "job_config"
    if cfg[section]["upload_script"] == "True":
        s3_upload_list.append((cfg[section]["script"],
                               cfg[section]["script_local_location"],
                               cfg[section]["script_s3_location"]))

    section = "user_script_config"
    if cfg[section]["upload_user_files"] == "True":
        # upload the compulsory user script
        s3_upload_list.append((cfg[section]["script"],
                               cfg[section]["user_files_local_location"],
                               cfg[section]["user_files_s3_location"]))
        # upload any optional user files
        if "supporting_files" in cfg[section]:
            for f in cfg[section]["supporting_files"].split(','):
                if f.strip() != "":
                    s3_upload_list.append((f.strip(),
                                           cfg[section]["user_files_local_location"],
                                           cfg[section]["user_files_s3_location"]))

    # call utility code to upload the list of files to S3
    files = utility.upload_files_to_s3(s3_upload_list, dry_run)
    cfg["s3"] = {"files": files}

    return cfg
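# A minimal sketch of how the wrapper above might be driven, assuming a config
# file that defines the job_config and user_script_config sections it reads;
# the path below is hypothetical.
def example_upload(dry_run=True):
    import configparser
    cfg = configparser.ConfigParser()
    cfg.optionxform = str  # preserve option-name case, as the submitters do
    cfg.read("config/pipeline_job.config")  # hypothetical path
    cfg = upload_files_to_s3(cfg, dry_run)
    # the wrapper records the uploaded files under a new [s3] section
    print(cfg["s3"]["files"])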
if parser_result.job_config is not None and parser_result.job_config.strip() != "":
    job_configuration = parser_result.job_config.strip()

config = configparser.ConfigParser()
config.optionxform = str
config.read(job_configuration)

if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
    cluster_id = utility.get_cluster_id(parser_result.dry_run)
else:
    cluster_id = parser_result.cluster_id.strip()

if cluster_id != "" and check_configuration(config):
    if config["job_config"].get("upload_downloader_script", "False") == "True":
        utility.upload_files_to_s3(
            [(config["job_config"]["downloader_script"],
              config["job_config"]["downloader_script_local_location"],
              config["job_config"]["downloader_script_s3_location"])],
            parser_result.dry_run)

    job_argument = build_command(config)

    if not parser_result.dry_run:
        emr_client = boto3.client("emr")

        # warn the user before removing any existing output
        out = config["script_arguments"]["output_location"]
        rep = config["script_arguments"]["report_location"]
        # find out which output dirs, if any, exist
        dirs_to_remove = utility.check_s3_path_exists([out, rep])

        if dirs_to_remove:
            response = input("About to remove any existing output directories." +
                             "\n\n\t{}\n\nProceed? [y/n]: ".format(
                                 '\n\n\t'.join(dirs_to_remove)))
            while response not in ['y', 'n']:
                response = input('Proceed? [y/n]: ')
            if response == 'n':
                print("Program Terminated. Modify config file to change " +
                      "output directories.")
                sys.exit(0)
            # remove the output directories
            if not utility.remove_s3_files(dirs_to_remove):
                print("Program terminated")
                sys.exit(1)

        job_submission = emr_client.add_job_flow_steps(**job_argument)
        print("Submitted job to cluster {}. Job id is {}".format(
            cluster_id, job_submission["StepIds"][0]))
    else:
        print(job_argument)
if parser_result.job_config is not None and parser_result.job_config.strip() != "":
    job_configuration = parser_result.job_config.strip()

config = configparser.ConfigParser()
config.optionxform = str
config.read(job_configuration)

if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
    cluster_id = utility.get_cluster_id(parser_result.dry_run)
else:
    cluster_id = parser_result.cluster_id.strip()

if cluster_id != "" and check_configuration(config):
    if config["job_config"].get("upload_analysis_script", "False") == "True":
        utility.upload_files_to_s3(
            [(config["job_config"]["analysis_script"],
              config["job_config"]["analysis_script_local_location"],
              config["job_config"]["analysis_script_s3_location"])],
            parser_result.dry_run)

    # size the Spark job; a negative executor count means "let Spark decide"
    num_executors = calculate_num_executor(
        cluster_id, config["spark_config"]["executor_memory"])
    if num_executors < 0:
        config["spark_config"]["num_executors"] = "None"
    else:
        config["spark_config"]["num_executors"] = str(num_executors)
    config["spark_config"]["executor_cores"] = "1"

    job_argument = build_command(config)

    if not parser_result.dry_run:
        emr_client = boto3.client("emr")

        # warn the user before removing any existing output; the confirm-and-
        # remove flow below follows the same pattern as the classify submitter
        out = config["script_arguments"]["output_location"]
        dirs_to_remove = utility.check_s3_path_exists([out])

        if dirs_to_remove:
            response = input("About to remove any existing output directories." +
                             "\n\n\t{}\n\nProceed? [y/n]: ".format(
                                 '\n\n\t'.join(dirs_to_remove)))
            while response not in ['y', 'n']:
                response = input('Proceed? [y/n]: ')
            if response == 'n':
                print("Program Terminated. Modify config file to change " +
                      "output directories.")
                sys.exit(0)
            if not utility.remove_s3_files(dirs_to_remove):
                print("Program terminated")
                sys.exit(1)

        job_submission = emr_client.add_job_flow_steps(**job_argument)
        print("Submitted job to cluster {}. Job id is {}".format(
            cluster_id, job_submission["StepIds"][0]))
    else:
        print(job_argument)
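# calculate_num_executor is project utility code not shown in these excerpts.
# A rough, hypothetical sketch of what it might do, assuming it divides the
# core instance group's aggregate memory by the requested executor memory and
# returns -1 when the cluster cannot be sized (the callers above treat a
# negative result as "let Spark decide"):
import re

import boto3

def calculate_num_executor_sketch(cluster_id, executor_memory):
    # assumed lookup table; a real implementation would cover more types
    memory_per_instance_gb = {"m4.large": 8, "m4.xlarge": 16, "r3.xlarge": 30.5}
    # assumes executor_memory looks like "4g"
    executor_memory_gb = int(re.sub(r"[^0-9]", "", executor_memory))
    emr_client = boto3.client("emr")
    groups = emr_client.list_instance_groups(ClusterId=cluster_id)["InstanceGroups"]
    total_gb = 0
    for group in groups:
        if group["InstanceGroupType"] == "CORE":
            gb = memory_per_instance_gb.get(group["InstanceType"], -1)
            if gb < 0:
                return -1  # unknown instance type; fall back to Spark defaults
            total_gb += gb * group["RunningInstanceCount"]
    if total_gb <= 0:
        return -1  # no sizeable core nodes found
    return int(total_gb // executor_memory_gb)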
action="store_true", dest="dry_run", help="Produce the configurations for the cluster to be created") parser_result = parser.parse_args() if parser_result.emr_config and parser_result.emr_config.strip() != "": emr_configuration = parser_result.emr_config config = configparser.ConfigParser() config.read(emr_configuration) if check_configuration(config): if config["EMR"].get("upload_bootstrap_scripts", "False") == "True": utility.upload_files_to_s3( [(bootstrap_script.strip(), config["EMR"]["bootstrap_scripts_local_location"], config["EMR"]["bootstrap_scripts_s3_location"]) for bootstrap_script in config["EMR"] ["bootstrap_scripts"].split(",")], parser_result.dry_run) emr_argument = build_command(config) if not parser_result.dry_run: emr_client = boto3.client("emr") cluster_launch = emr_client.run_job_flow(**emr_argument) print("Cluster has been launched with ID", cluster_launch["JobFlowId"]) else: print("\n".join([ "{} = {}".format(*emr_arg) for emr_arg in list(emr_argument.items()) ]))
if __name__ == "__main__": parser = argparse.ArgumentParser(description='Cluster launcher for spark-based RNA-seq Pipeline') parser.add_argument('--config', '-c', action="store", dest="emr_config", help="EMR configuration file") parser.add_argument('--dry-run', '-d', action="store_true", dest="dry_run", help="Produce the configurations for the cluster to be created") parser_result = parser.parse_args() if parser_result.emr_config and parser_result.emr_config.strip() != "": emr_configuration = parser_result.emr_config config = configparser.ConfigParser() config.read(emr_configuration) if check_configuration(config): if config["EMR"].get("upload_bootstrap_scripts", "False") == "True": utility.upload_files_to_s3( [(bootstrap_script.strip(), config["EMR"]["bootstrap_scripts_local_location"], config["EMR"]["bootstrap_scripts_s3_location"]) for bootstrap_script in config["EMR"]["bootstrap_scripts"].split(",")], parser_result.dry_run) emr_argument = build_command(config) if not parser_result.dry_run: emr_client = boto3.client("emr") cluster_launch = emr_client.run_job_flow(**emr_argument) print("Cluster has been launched with ID", cluster_launch["JobFlowId"]) else: print("\n".join(["{} = {}".format(*emr_arg) for emr_arg in list(emr_argument.items())]))