def main(args):
    """Entry point: collect stats on active EMR job flows and print them.

    Takes no positional arguments; errors out via the option parser if
    any are supplied. Prints JSON (or a pretty-printed report with -p).
    """
    # parse command-line args.
    # NOTE: each description fragment must end with a trailing space so the
    # concatenated help text reads correctly (the original was missing them,
    # producing e.g. "jobflowsand").
    usage = '%prog [options]'
    description = "Collect EMR stats from active jobflows. "
    description += "Active jobflows are those in states of: "
    description += "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
    description += "Collected stats include total number of active jobflows "
    description += "and total number of Amazon EC2 instances used to execute "
    description += "these jobflows. The instance counts are not separated by "
    description += "instance type."
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)

    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)

    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
def make_option_parser():
    """Build the option parser for the S3 tmp-dir cleanup tool."""
    description = (
        'Delete all files in a given URI that are older than a specified'
        ' time.\n\nThe time parameter defines the threshold for removing'
        ' files. If the file has not been accessed for *time*, the file is'
        ' removed. The time argument is a number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days. If no suffix is specified, time is in hours.')

    parser = OptionParser(usage='%prog [options] <time-untouched> <URIs>',
                          description=description)
    parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        help="Don't actually delete any files; just log that we would")

    add_basic_opts(parser)
    # pull the AWS region/endpoint options out of MRJob's option groups
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        parser: ('aws_region', 's3_endpoint'),
    })
    alphabetize_options(parser)

    return parser
def make_option_parser():
    """Build the option parser for the file-cleanup tool."""
    usage = "%prog [options] <time-untouched> <URIs>"
    description = (
        "Delete all files in a given URI that are older than a specified"
        " time.\n\nThe time parameter defines the threshold for removing"
        " files. If the file has not been accessed for *time*, the file is"
        " removed. The time argument is a number with an optional"
        " single-character suffix specifying the units: m for minutes, h for"
        " hours, d for days. If no suffix is specified, time is in hours."
    )

    parser = OptionParser(usage=usage, description=description)
    parser.add_option(
        "-t",
        "--test",
        dest="test",
        action="store_true",
        default=False,
        help="Don't actually delete any files; just log that we would",
    )
    add_basic_opts(parser)

    return parser
def make_option_parser():
    """Build the option parser for the idle-job-flow termination tool.

    Returns an OptionParser carrying the termination criteria options plus
    the shared basic and EMR connection option groups, alphabetized.
    """
    usage = '%prog [options]'
    description = ('Terminate idle EMR job flows that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' job flows that have been idle for one hour).')
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type='float',
        help=('Max number of hours a job flow can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    option_parser.add_option(
        '--max-mins-locked', dest='max_mins_locked',
        default=DEFAULT_MAX_MINUTES_LOCKED, type='float',
        help='Max number of minutes a job flow can be locked while idle.')
    option_parser.add_option(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type='float',
        help=('Terminate job flows that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))
    option_parser.add_option(
        '--unpooled-only', dest='unpooled_only', action='store_true',
        default=False,
        help='Only terminate un-pooled job flows')
    option_parser.add_option(
        '--pooled-only', dest='pooled_only', action='store_true',
        default=False,
        help='Only terminate pooled job flows')
    option_parser.add_option(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate job flows in the given named pool.')
    option_parser.add_option(
        '--dry-run', dest='dry_run', default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        # fixed help text: it was copy-pasted from a file-deletion tool;
        # this tool terminates job flows, it doesn't delete files
        help="Don't actually terminate any job flows; just log that we"
             " would")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    alphabetize_options(option_parser)
    return option_parser
def make_option_parser():
    """Build the option parser for the idle-cluster termination tool.

    Returns an OptionParser carrying the termination criteria options plus
    the shared basic and EMR connection option groups, alphabetized.
    """
    usage = '%prog [options]'
    description = ('Terminate idle EMR clusters that meet the criteria'
                   ' passed in on the command line (or, by default,'
                   ' clusters that have been idle for one hour).')
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        '--max-hours-idle', dest='max_hours_idle',
        default=None, type='float',
        help=('Max number of hours a cluster can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    option_parser.add_option(
        '--max-mins-locked', dest='max_mins_locked',
        default=DEFAULT_MAX_MINUTES_LOCKED, type='float',
        help='Max number of minutes a cluster can be locked while idle.')
    option_parser.add_option(
        '--mins-to-end-of-hour', dest='mins_to_end_of_hour',
        default=None, type='float',
        help=('Terminate clusters that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))
    option_parser.add_option(
        '--unpooled-only', dest='unpooled_only', action='store_true',
        default=False,
        help='Only terminate un-pooled clusters')
    option_parser.add_option(
        '--pooled-only', dest='pooled_only', action='store_true',
        default=False,
        help='Only terminate pooled clusters')
    option_parser.add_option(
        '--pool-name', dest='pool_name', default=None,
        help='Only terminate clusters in the given named pool.')
    option_parser.add_option(
        '--dry-run', dest='dry_run', default=False,
        action='store_true',
        help="Don't actually kill idle jobs; just log that we would")
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        # fixed help text: it was copy-pasted from a file-deletion tool;
        # this tool terminates clusters, it doesn't delete files
        help="Don't actually terminate any clusters; just log that we would")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    alphabetize_options(option_parser)
    return option_parser
def make_option_parser():
    """Build the option parser for the terminate-job-flow tool.

    Expects a single positional argument (the job flow ID).
    """
    usage = '%prog [options] jobflowid'
    description = 'Terminate an existing EMR job flow.'
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        # fixed help text: it was copy-pasted from a file-deletion tool;
        # this tool terminates a job flow, it doesn't delete files
        help="Don't actually terminate the job flow; just log that we would")
    add_basic_opts(option_parser)
    return option_parser
def make_option_parser():
    """Build the option parser for the EMR usage audit tool."""
    parser = OptionParser(usage='%prog [options]',
                          description='Print a giant report on EMR usage.')
    parser.add_option(
        '--max-days-ago', dest='max_days_ago', type='float', default=None,
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))
    add_basic_opts(parser)
    return parser
def make_option_parser():
    """Build the option parser for the EMR log listing/parsing tool."""
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    parser = OptionParser(usage='usage: %prog [options] JOB_FLOW_ID',
                          description=description)

    add_basic_opts(parser)

    parser.add_option(
        '-f', '--find-failure', dest='find_failure',
        action='store_true', default=False,
        help=('Search the logs for information about why'
              ' the job failed'))
    parser.add_option(
        '-l', '--list', dest='list_relevant',
        action='store_true', default=False,
        help='List log files MRJob finds relevant')
    parser.add_option(
        '-L', '--list-all', dest='list_all',
        action='store_true', default=False,
        help='List all log files')
    parser.add_option(
        '-a', '--cat', dest='cat_relevant',
        action='store_true', default=False,
        help='Cat log files MRJob finds relevant')
    parser.add_option(
        '-A', '--cat-all', dest='cat_all',
        action='store_true', default=False,
        help='Cat all log files to JOB_FLOW_ID/')
    parser.add_option(
        '-s', '--step-num', dest='step_num',
        action='store', type='int', default=None,
        help=('Limit results to a single step. To be used'
              ' with --list and --cat.'))
    parser.add_option(
        '--counters', dest='get_counters',
        action='store_true', default=False,
        help='Show counters from the job flow')

    add_emr_connect_opts(parser)
    # borrow the ssh/S3 options from MRJob's own option groups
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')
    })
    alphabetize_options(parser)
    return parser
def make_option_parser():
    """Build the option parser for the terminate-cluster tool.

    Expects a single positional argument (the cluster ID).
    """
    usage = '%prog [options] cluster-id'
    description = 'Terminate an existing EMR cluster.'
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        '-t', '--test', dest='test', default=False, action='store_true',
        # fixed help text: it was copy-pasted from a file-deletion tool;
        # this tool terminates a cluster, it doesn't delete files
        help="Don't actually terminate the cluster; just log that we would")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    alphabetize_options(option_parser)
    return option_parser
def make_option_parser():
    """Build the option parser for the long-running-job report tool."""
    description = ('Report jobs running for more than a certain number of'
                   ' hours (by default, %.1f). This can help catch buggy jobs'
                   ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)
    parser = OptionParser(usage='%prog [options]', description=description)
    parser.add_option(
        '--min-hours', dest='min_hours', type='float',
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %default'))
    add_basic_opts(parser)
    return parser
def make_option_parser():
    """Build the option parser for the EMR usage report tool."""
    parser = OptionParser(
        usage="%prog [options]",
        description="Print a giant report on EMR usage.",
    )
    parser.add_option(
        "--max-days-ago",
        dest="max_days_ago",
        type="float",
        default=None,
        help=(
            "Max number of days ago to look at jobs. By default, we go back"
            " as far as EMR supports (currently about 2 months)"
        ),
    )
    add_basic_opts(parser)
    return parser
def main(cl_args=None):
    """Run a shell command on every node of an EMR cluster.

    Each node's stdout/stderr is stored under OUTPUT_DIR (defaults to
    a directory named after the cluster ID). Exits with status 1 if
    fewer than two positional arguments are given.
    """
    usage = 'usage: %prog CLUSTER_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " CLUSTER_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    # borrow ssh-related options from MRJob's option groups
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # every option except these is forwarded to the runner as a kwarg
    runner_kwargs = {key: value for key, value in options.__dict__.items()
                     if key not in ('output_dir', 'quiet', 'verbose')}

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
def make_option_parser():
    """Build the option parser for the persistent-job-flow creation tool."""
    description = (
        'Create a persistent EMR job flow to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob.tools.emr.terminate_idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')
    parser = OptionParser(usage='%prog [options]', description=description)

    add_basic_opts(parser)
    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        parser: ('bootstrap_mrjob', 'label', 'owner'),
    })
    add_emr_connect_opts(parser)
    add_emr_launch_opts(parser)

    alphabetize_options(parser)
    return parser
def main(cl_args=None):
    """Run a shell command on every node of an EMR job flow.

    Each node's stdout and stderr are stored under OUTPUT_DIR (defaults
    to a directory named after the job flow ID). Exits with status 1 if
    fewer than two positional arguments are given.
    """
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    # borrow ssh-related options from MRJob's option groups
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)
    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # every option except these is forwarded to the runner as a kwarg
    runner_kwargs = {key: value for key, value in options.__dict__.items()
                     if key not in ('output_dir', 'quiet', 'verbose')}

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
def configure_options(self):
    """Define arguments for this script. Called from :py:meth:`__init__()`.

    Run ``python -m mrjob.job.MRJob --help`` to see all options.

    Re-define to define custom command-line arguments::

        def configure_options(self):
            super(MRYourJob, self).configure_options()

            self.add_passthrough_option(...)
            self.add_file_option(...)
            ...
    """
    # --help and --help-* flags; each prints a different subset of the
    # option groups registered below
    self.option_parser.add_option(
        '--help', dest='help_main', action='store_true', default=False,
        help='show this message and exit')

    self.option_parser.add_option(
        '--help-emr', dest='help_emr', action='store_true', default=False,
        help='show EMR-related options')

    self.option_parser.add_option(
        '--help-hadoop', dest='help_hadoop', action='store_true',
        default=False,
        help='show Hadoop-related options')

    self.option_parser.add_option(
        '--help-local', dest='help_local', action='store_true',
        default=False,
        help='show local/inline runner-related options')

    self.option_parser.add_option(
        '--help-runner', dest='help_runner', action='store_true',
        default=False, help='show runner-related options')

    # protocol stuff
    self.proto_opt_group = OptionGroup(
        self.option_parser, 'Protocols')
    self.option_parser.add_option_group(self.proto_opt_group)

    add_protocol_opts(self.proto_opt_group)

    # options for running the entire job
    self.runner_opt_group = OptionGroup(
        self.option_parser, 'Running the entire job')
    self.option_parser.add_option_group(self.runner_opt_group)

    add_runner_opts(self.runner_opt_group, self._DEFAULT_RUNNER)
    add_basic_opts(self.runner_opt_group)

    # options for inline/local runners
    self.local_opt_group = OptionGroup(
        self.option_parser,
        'Running locally (these apply when you set -r inline or -r local)')
    self.option_parser.add_option_group(self.local_opt_group)

    add_local_opts(self.local_opt_group)

    # options common to Hadoop and EMR
    self.hadoop_emr_opt_group = OptionGroup(
        self.option_parser,
        'Running on Hadoop or EMR (these apply when you set -r hadoop or'
        ' -r emr)')
    self.option_parser.add_option_group(self.hadoop_emr_opt_group)

    add_hadoop_emr_opts(self.hadoop_emr_opt_group)

    # options for running the job on Hadoop
    self.hadoop_opt_group = OptionGroup(
        self.option_parser,
        'Running on Hadoop (these apply when you set -r hadoop)')
    self.option_parser.add_option_group(self.hadoop_opt_group)

    add_hadoop_opts(self.hadoop_opt_group)

    # options for running the job on EMR
    self.emr_opt_group = OptionGroup(
        self.option_parser,
        'Running on EMR (these apply when you set -r emr)')
    self.option_parser.add_option_group(self.emr_opt_group)

    add_emr_opts(self.emr_opt_group)
def make_option_parser():
    """Build the option parser for the idle-job-flow termination tool.

    Returns an OptionParser carrying the termination criteria options
    plus the shared basic option group.
    """
    usage = "%prog [options]"
    description = (
        "Terminate idle EMR job flows that meet the criteria"
        " passed in on the command line (or, by default,"
        " job flows that have been idle for one hour)."
    )
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "--max-hours-idle",
        dest="max_hours_idle",
        default=None,
        type="float",
        help=(
            "Max number of hours a job flow can go without bootstrapping,"
            " running a step, or having a new step created. This will fire"
            " even if there are pending steps which EMR has failed to"
            " start. Make sure you set this higher than the amount of time"
            " your jobs can take to start instances and bootstrap."
        ),
    )
    option_parser.add_option(
        "--max-mins-locked",
        dest="max_mins_locked",
        default=DEFAULT_MAX_MINUTES_LOCKED,
        type="float",
        help="Max number of minutes a job flow can be locked while idle.",
    )
    option_parser.add_option(
        "--mins-to-end-of-hour",
        dest="mins_to_end_of_hour",
        default=None,
        type="float",
        help=(
            "Terminate job flows that are within this many minutes of"
            " the end of a full hour since the job started running"
            " AND have no pending steps."
        ),
    )
    option_parser.add_option(
        "--unpooled-only",
        dest="unpooled_only",
        action="store_true",
        default=False,
        help="Only terminate un-pooled job flows",
    )
    option_parser.add_option(
        "--pooled-only",
        dest="pooled_only",
        action="store_true",
        default=False,
        help="Only terminate pooled job flows",
    )
    option_parser.add_option(
        "--pool-name",
        dest="pool_name",
        default=None,
        help="Only terminate job flows in the given named pool.",
    )
    option_parser.add_option(
        "--dry-run",
        dest="dry_run",
        default=False,
        action="store_true",
        help="Don't actually kill idle jobs; just log that we would",
    )
    option_parser.add_option(
        "-t",
        "--test",
        dest="test",
        default=False,
        action="store_true",
        # fixed help text: it was copy-pasted from a file-deletion tool;
        # this tool terminates job flows, it doesn't delete files
        help="Don't actually terminate any job flows; just log that we"
             " would",
    )
    add_basic_opts(option_parser)
    return option_parser
def configure_options(self):
    """Define arguments for this script. Called from :py:meth:`__init__()`.

    Run ``python -m mrjob.job.MRJob --help`` to see all options.

    Re-define to define custom command-line arguments::

        def configure_options(self):
            super(MRYourJob, self).configure_options()

            self.add_passthrough_option(...)
            self.add_file_option(...)
            ...
    """
    # --help and --help-* flags; each prints a different subset of the
    # option groups registered below
    self.option_parser.add_option('--help', dest='help_main',
                                  action='store_true', default=False,
                                  help='show this message and exit')

    self.option_parser.add_option('--help-emr', dest='help_emr',
                                  action='store_true', default=False,
                                  help='show EMR-related options')

    self.option_parser.add_option('--help-hadoop', dest='help_hadoop',
                                  action='store_true', default=False,
                                  help='show Hadoop-related options')

    self.option_parser.add_option('--help-runner', dest='help_runner',
                                  action='store_true', default=False,
                                  help='show runner-related options')

    # protocol stuff
    self.proto_opt_group = OptionGroup(self.option_parser, 'Protocols')
    self.option_parser.add_option_group(self.proto_opt_group)

    add_protocol_opts(self.proto_opt_group)

    # options for running the entire job
    self.runner_opt_group = OptionGroup(self.option_parser,
                                        'Running the entire job')
    self.option_parser.add_option_group(self.runner_opt_group)

    add_runner_opts(self.runner_opt_group, self._DEFAULT_RUNNER)
    add_basic_opts(self.runner_opt_group)

    # options shared by the hadoop, emr, and local runners
    self.hadoop_opts_opt_group = OptionGroup(
        self.option_parser,
        'Configuring or emulating Hadoop (these apply when you set -r'
        ' hadoop, -r emr, or -r local)')
    self.option_parser.add_option_group(self.hadoop_opts_opt_group)

    add_hadoop_shared_opts(self.hadoop_opts_opt_group)

    # options common to Hadoop and EMR
    self.hadoop_emr_opt_group = OptionGroup(
        self.option_parser,
        'Running on Hadoop or EMR (these apply when you set -r hadoop or'
        ' -r emr)')
    self.option_parser.add_option_group(self.hadoop_emr_opt_group)

    add_hadoop_emr_opts(self.hadoop_emr_opt_group)

    # options for running the job on Hadoop
    self.hadoop_opt_group = OptionGroup(
        self.option_parser,
        'Running on Hadoop (these apply when you set -r hadoop)')
    self.option_parser.add_option_group(self.hadoop_opt_group)

    add_hadoop_opts(self.hadoop_opt_group)

    # options for running the job on EMR
    self.emr_opt_group = OptionGroup(
        self.option_parser,
        'Running on Amazon Elastic MapReduce (these apply when you set -r'
        ' emr)')
    self.option_parser.add_option_group(self.emr_opt_group)

    add_emr_opts(self.emr_opt_group)
def configure_options(self):
    """Define arguments for this script. Called from :py:meth:`__init__()`.

    Run ``python -m mrjob.job.MRJob --help`` to see all options.

    Re-define to define custom command-line arguments::

        def configure_options(self):
            super(MRYourJob, self).configure_options()

            self.add_passthrough_option(...)
            self.add_file_option(...)
            ...
    """
    # --help and --help-* flags; each prints a different subset of the
    # option groups registered below
    self.option_parser.add_option(
        "--help", dest="help_main", action="store_true", default=False,
        help="show this message and exit"
    )
    self.option_parser.add_option(
        "--help-emr", dest="help_emr", action="store_true", default=False,
        help="show EMR-related options"
    )
    self.option_parser.add_option(
        "--help-hadoop", dest="help_hadoop", action="store_true",
        default=False, help="show Hadoop-related options"
    )
    self.option_parser.add_option(
        "--help-runner", dest="help_runner", action="store_true",
        default=False, help="show runner-related options"
    )

    # protocol stuff
    self.proto_opt_group = OptionGroup(self.option_parser, "Protocols")
    self.option_parser.add_option_group(self.proto_opt_group)

    add_protocol_opts(self.proto_opt_group)

    # options for running the entire job
    self.runner_opt_group = OptionGroup(self.option_parser,
                                        "Running the entire job")
    self.option_parser.add_option_group(self.runner_opt_group)

    add_runner_opts(self.runner_opt_group, self._DEFAULT_RUNNER)
    add_basic_opts(self.runner_opt_group)

    # options shared by the hadoop, emr, and local runners
    self.hadoop_opts_opt_group = OptionGroup(
        self.option_parser,
        "Configuring or emulating Hadoop (these apply when you set -r"
        " hadoop, -r emr, or -r local)",
    )
    self.option_parser.add_option_group(self.hadoop_opts_opt_group)

    add_hadoop_shared_opts(self.hadoop_opts_opt_group)

    # options common to Hadoop and EMR
    self.hadoop_emr_opt_group = OptionGroup(
        self.option_parser,
        "Running on Hadoop or EMR (these apply when you set -r hadoop or"
        " -r emr)"
    )
    self.option_parser.add_option_group(self.hadoop_emr_opt_group)

    add_hadoop_emr_opts(self.hadoop_emr_opt_group)

    # options for running the job on Hadoop
    self.hadoop_opt_group = OptionGroup(
        self.option_parser,
        "Running on Hadoop (these apply when you set -r hadoop)"
    )
    self.option_parser.add_option_group(self.hadoop_opt_group)

    add_hadoop_opts(self.hadoop_opt_group)

    # options for running the job on EMR
    self.emr_opt_group = OptionGroup(
        self.option_parser,
        "Running on Amazon Elastic MapReduce (these apply when you set -r"
        " emr)"
    )
    self.option_parser.add_option_group(self.emr_opt_group)

    add_emr_opts(self.emr_opt_group)
def make_option_parser():
    """Build the option parser for the EMR log fetch/parse tool."""
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    opt_parser = OptionParser(usage='usage: %prog [options] JOB_FLOW_ID',
                              description=description)
    add_basic_opts(opt_parser)

    # simple boolean flags, registered in display order
    for short_flag, long_flag, dest, help_text in [
            ('-f', '--find-failure', 'find_failure',
             'Search the logs for information about why the job failed'),
            ('-l', '--list', 'list_relevant',
             'List log files MRJob finds relevant'),
            ('-L', '--list-all', 'list_all', 'List all log files'),
            ('-a', '--cat', 'cat_relevant',
             'Cat log files MRJob finds relevant'),
            ('-A', '--cat-all', 'cat_all',
             'Cat all log files to JOB_FLOW_ID/'),
    ]:
        opt_parser.add_option(short_flag, long_flag, dest=dest,
                              action='store_true', default=False,
                              help=help_text)

    opt_parser.add_option('-s', '--step-num', dest='step_num',
                          action='store', type='int', default=None,
                          help=('Limit results to a single step. To be used'
                                ' with --list and --cat.'))
    opt_parser.add_option('--counters', dest='get_counters',
                          action='store_true', default=False,
                          help='Show counters from the job flow')

    add_emr_connect_opts(opt_parser)
    # borrow the ssh/S3 options from MRJob's own option groups
    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {opt_parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')})
    alphabetize_options(opt_parser)
    return opt_parser