def make_option_parser():
    """Build the option parser for the idle-cluster termination tool.

    The tool-specific switches are registered first; the shared helpers
    ``add_basic_opts`` and ``add_emr_connect_opts`` are then called on
    the parser, followed by ``alphabetize_options``.

    Returns the populated ``OptionParser``.
    """
    parser = OptionParser(
        usage='%prog [options]',
        description=(
            'Terminate idle EMR clusters that meet the criteria'
            ' passed in on the command line (or, by default,'
            ' clusters that have been idle for one hour).'))

    # The longer help texts get their own names so the table below
    # stays readable.
    max_hours_idle_help = (
        'Max number of hours a cluster can go without bootstrapping,'
        ' running a step, or having a new step created. This will fire'
        ' even if there are pending steps which EMR has failed to'
        ' start. Make sure you set this higher than the amount of time'
        ' your jobs can take to start instances and bootstrap.')
    end_of_hour_help = (
        'Terminate clusters that are within this many minutes of'
        ' the end of a full hour since the job started running'
        ' AND have no pending steps.')

    # Each entry is (option strings, keyword arguments), listed in the
    # order in which the options are registered.
    specs = (
        (('--max-hours-idle',),
         dict(dest='max_hours_idle', default=None, type='float',
              help=max_hours_idle_help)),
        (('--max-mins-locked',),
         dict(dest='max_mins_locked',
              default=DEFAULT_MAX_MINUTES_LOCKED, type='float',
              help='Max number of minutes a cluster can be locked'
                   ' while idle.')),
        (('--mins-to-end-of-hour',),
         dict(dest='mins_to_end_of_hour', default=None, type='float',
              help=end_of_hour_help)),
        (('--unpooled-only',),
         dict(dest='unpooled_only', action='store_true', default=False,
              help='Only terminate un-pooled clusters')),
        (('--pooled-only',),
         dict(dest='pooled_only', action='store_true', default=False,
              help='Only terminate pooled clusters')),
        (('--pool-name',),
         dict(dest='pool_name', default=None,
              help='Only terminate clusters in the given named pool.')),
        (('--dry-run',),
         dict(dest='dry_run', default=False, action='store_true',
              help="Don't actually kill idle jobs; just log that we"
                   " would")),
        # NOTE(review): this help text talks about deleting files, which
        # looks copied from another tool -- confirm the wording.
        (('-t', '--test'),
         dict(dest='test', default=False, action='store_true',
              help="Don't actually delete any files; just log that we"
                   " would")),
    )
    for opt_strings, settings in specs:
        parser.add_option(*opt_strings, **settings)

    add_basic_opts(parser)
    add_emr_connect_opts(parser)
    alphabetize_options(parser)

    return parser
def make_option_parser():
    """Create the ``OptionParser`` for terminating idle EMR job flows.

    Adds the idle/lock/pool filtering options plus the ``--dry-run`` and
    ``--test`` switches, then calls ``add_basic_opts``,
    ``add_emr_connect_opts`` and ``alphabetize_options`` on the parser
    before returning it.
    """
    parser = OptionParser(
        usage='%prog [options]',
        description=('Terminate idle EMR job flows that meet the criteria'
                     ' passed in on the command line (or, by default,'
                     ' job flows that have been idle for one hour).'))

    # short alias: every tool-specific option goes through add_option()
    add = parser.add_option

    # -- idle / lock thresholds --
    add('--max-hours-idle',
        type='float',
        dest='max_hours_idle',
        default=None,
        help=('Max number of hours a job flow can go without bootstrapping,'
              ' running a step, or having a new step created. This will fire'
              ' even if there are pending steps which EMR has failed to'
              ' start. Make sure you set this higher than the amount of time'
              ' your jobs can take to start instances and bootstrap.'))
    add('--max-mins-locked',
        type='float',
        dest='max_mins_locked',
        default=DEFAULT_MAX_MINUTES_LOCKED,
        help='Max number of minutes a job flow can be locked while idle.')
    add('--mins-to-end-of-hour',
        type='float',
        dest='mins_to_end_of_hour',
        default=None,
        help=('Terminate job flows that are within this many minutes of'
              ' the end of a full hour since the job started running'
              ' AND have no pending steps.'))

    # -- pool filters --
    add('--unpooled-only',
        action='store_true',
        dest='unpooled_only',
        default=False,
        help='Only terminate un-pooled job flows')
    add('--pooled-only',
        action='store_true',
        dest='pooled_only',
        default=False,
        help='Only terminate pooled job flows')
    add('--pool-name',
        dest='pool_name',
        default=None,
        help='Only terminate job flows in the given named pool.')

    # -- "log only" switches --
    add('--dry-run',
        action='store_true',
        dest='dry_run',
        default=False,
        help="Don't actually kill idle jobs; just log that we would")
    add('-t', '--test',
        action='store_true',
        dest='test',
        default=False,
        help="Don't actually delete any files; just log that we would")

    add_basic_opts(parser)
    add_emr_connect_opts(parser)
    alphabetize_options(parser)

    return parser
Example #3
0
def make_option_parser():
    """Build the option parser for the EMR log-fetching tool.

    ``add_basic_opts`` is called first, then the listing / cat / failure
    / counter switches are registered, then ``add_emr_connect_opts``.
    Three further options (``ec2_key_pair_file``, ``s3_sync_wait_time``,
    ``ssh_bin``) are pulled in from ``MRJob().all_option_groups()`` via
    ``scrape_options_into_new_groups`` before ``alphabetize_options`` is
    called.

    Returns the populated ``OptionParser``.
    """
    parser = OptionParser(
        usage='usage: %prog [options] JOB_FLOW_ID',
        description=(
            'List, display, and parse Hadoop logs associated with EMR job'
            ' flows.'
            ' Useful for debugging failed jobs for which mrjob did not'
            ' display a'
            ' useful error message or for inspecting jobs whose output has'
            ' been'
            ' lost.'))

    add_basic_opts(parser)

    # keyword arguments shared by every on/off switch below
    switch = dict(action='store_true', default=False)

    parser.add_option(
        '-f', '--find-failure', dest='find_failure',
        help=('Search the logs for information about why'
              ' the job failed'),
        **switch)
    parser.add_option(
        '-l', '--list', dest='list_relevant',
        help='List log files MRJob finds relevant',
        **switch)
    parser.add_option(
        '-L', '--list-all', dest='list_all',
        help='List all log files',
        **switch)
    parser.add_option(
        '-a', '--cat', dest='cat_relevant',
        help='Cat log files MRJob finds relevant',
        **switch)
    parser.add_option(
        '-A', '--cat-all', dest='cat_all',
        help='Cat all log files to JOB_FLOW_ID/',
        **switch)
    parser.add_option(
        '-s', '--step-num', dest='step_num',
        action='store', type='int', default=None,
        help=('Limit results to a single step. To be used'
              ' with --list and --cat.'))
    parser.add_option(
        '--counters', dest='get_counters',
        help='Show counters from the job flow',
        **switch)

    add_emr_connect_opts(parser)

    # copy a few individual options over from the MRJob option groups
    scraped_dests = ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')
    scrape_options_into_new_groups(
        MRJob().all_option_groups(), {parser: scraped_dests})

    alphabetize_options(parser)

    return parser
Example #4
0
def make_option_parser():
    """Return the ``OptionParser`` for the terminate-job-flow tool.

    Registers the ``-t``/``--test`` switch, then calls
    ``add_basic_opts``, ``add_emr_connect_opts`` and
    ``alphabetize_options`` on the parser.
    """
    parser = OptionParser(
        usage='%prog [options] jobflowid',
        description='Terminate an existing EMR job flow.')

    test_help = "Don't actually delete any files; just log that we would"
    parser.add_option('-t', '--test',
                      action='store_true',
                      dest='test',
                      default=False,
                      help=test_help)

    for extend in (add_basic_opts, add_emr_connect_opts,
                   alphabetize_options):
        extend(parser)

    return parser
Example #5
0
def make_option_parser():
    """Return the ``OptionParser`` for the terminate-cluster tool.

    The parser gets a single tool-specific switch (``-t``/``--test``)
    and is then handed to ``add_basic_opts``, ``add_emr_connect_opts``
    and ``alphabetize_options`` in that order.
    """
    parser = OptionParser(usage='%prog [options] cluster-id',
                          description='Terminate an existing EMR cluster.')

    test_switch = dict(
        dest='test',
        action='store_true',
        default=False,
        help="Don't actually delete any files; just log that we would",
    )
    parser.add_option('-t', '--test', **test_switch)

    add_basic_opts(parser)
    add_emr_connect_opts(parser)
    alphabetize_options(parser)

    return parser
Example #6
0
def make_option_parser():
    """Return the ``OptionParser`` for the EMR usage report tool.

    Registers ``--max-days-ago`` (a float, ``None`` when not given) and
    then calls ``add_basic_opts``, ``add_emr_connect_opts`` and
    ``alphabetize_options`` on the parser.
    """
    parser = OptionParser(usage='%prog [options]',
                          description='Print a giant report on EMR usage.')

    max_days_ago_help = (
        'Max number of days ago to look at jobs. By default, we go back'
        ' as far as EMR supports (currently about 2 months)')
    parser.add_option('--max-days-ago',
                      dest='max_days_ago',
                      default=None,
                      type='float',
                      help=max_days_ago_help)

    add_basic_opts(parser)
    add_emr_connect_opts(parser)
    alphabetize_options(parser)

    return parser
Example #7
0
def make_option_parser():
    """Build the command-line parser for the EMR usage report.

    One tool-specific option, ``--max-days-ago``, is registered; the
    shared helpers ``add_basic_opts`` and ``add_emr_connect_opts`` are
    then called on the parser, followed by ``alphabetize_options``.

    Returns the populated ``OptionParser``.
    """
    parser = OptionParser(
        usage='%prog [options]',
        description='Print a giant report on EMR usage.',
    )

    max_days_ago = dict(
        dest='max_days_ago',
        type='float',
        default=None,
        help=('Max number of days ago to look at jobs. By default, we go'
              ' back'
              ' as far as EMR supports (currently about 2 months)'),
    )
    parser.add_option('--max-days-ago', **max_days_ago)

    for extend in (add_basic_opts, add_emr_connect_opts):
        extend(parser)

    alphabetize_options(parser)

    return parser
Example #8
0
def make_option_parser():
    """Return the ``OptionParser`` for the long-running-jobs report.

    ``DEFAULT_MIN_HOURS`` is interpolated into the description and is
    also the default of ``--min-hours``. After registering that option
    the parser is passed to ``add_basic_opts``, ``add_emr_connect_opts``
    and ``alphabetize_options``.
    """
    description_template = (
        'Report jobs running for more than a certain number of'
        ' hours (by default, %.1f). This can help catch buggy jobs'
        ' and Hadoop/EMR operational issues.')
    parser = OptionParser(
        usage='%prog [options]',
        description=description_template % DEFAULT_MIN_HOURS)

    # "%default" is left as-is here; optparse substitutes it when it
    # renders the help text.
    min_hours_help = (
        'Minimum number of hours a job can run before we report it.'
        ' Default: %default')
    parser.add_option('--min-hours',
                      dest='min_hours',
                      default=DEFAULT_MIN_HOURS,
                      type='float',
                      help=min_hours_help)

    add_basic_opts(parser)
    add_emr_connect_opts(parser)
    alphabetize_options(parser)

    return parser
def make_option_parser():
    """Build the command-line parser for reporting long-running jobs.

    Registers ``--min-hours`` (float, defaulting to
    ``DEFAULT_MIN_HOURS``), calls ``add_basic_opts`` and
    ``add_emr_connect_opts`` on the parser, then
    ``alphabetize_options``.

    Returns the populated ``OptionParser``.
    """
    # the default threshold is shown in the description with one decimal
    summary = (
        'Report jobs running for more than a certain number of'
        ' hours (by default, %.1f). This can help catch buggy jobs'
        ' and Hadoop/EMR operational issues.' % DEFAULT_MIN_HOURS)

    parser = OptionParser(description=summary, usage='%prog [options]')

    min_hours = dict(
        type='float',
        dest='min_hours',
        default=DEFAULT_MIN_HOURS,
        help=('Minimum number of hours a job can run before we report it.'
              ' Default: %default'),
    )
    parser.add_option('--min-hours', **min_hours)

    for extend in (add_basic_opts, add_emr_connect_opts):
        extend(parser)

    alphabetize_options(parser)

    return parser
Example #10
0
def main(cl_args=None):
    """Run a shell command on the nodes of an EMR cluster.

    Two positional arguments are read from the command line: the
    cluster ID and the command string (split into arguments with
    ``shlex_split``). The output directory comes from
    ``-o``/``--output-dir`` and falls back to the cluster ID; its
    absolute path is handed to ``run_on_all_nodes``.

    :param cl_args: list of command-line arguments to parse. ``None``
                    makes optparse fall back to ``sys.argv[1:]``.

    Prints the help text and exits with status 1 if fewer than two
    positional arguments are given.
    """
    # The output directory is only accepted through -o/--output-dir; the
    # positionals are the cluster ID and the command string. The usage
    # line therefore must not list OUTPUT_DIR as a positional argument,
    # otherwise users following it get their output directory name
    # executed as the command.
    usage = 'usage: %prog CLUSTER_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o',
                             '--output-dir',
                             dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " CLUSTER_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # Every parsed option is forwarded to the runner except the ones
    # that are consumed by this script itself.
    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    # any positional arguments beyond the first two are ignored
    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Example #11
0
def make_option_parser():
    """Return the ``OptionParser`` for the create-job-flow tool.

    This parser defines no options of its own. It is filled by
    ``add_basic_opts``, by scraping ``bootstrap_mrjob``, ``label`` and
    ``owner`` out of ``MRJob().all_option_groups()``, and by
    ``add_emr_connect_opts`` and ``add_emr_launch_opts``, after which
    ``alphabetize_options`` is called.
    """
    parser = OptionParser(
        usage='%prog [options]',
        description=(
            'Create a persistent EMR job flow to run jobs in, and print its'
            ' ID to'
            ' stdout. WARNING: Do not run'
            ' this without mrjob.tools.emr.terminate_idle_job_flows in your'
            ' crontab; job flows left idle can quickly become expensive!'))

    add_basic_opts(parser)

    # these aren't nicely broken down, just scrape specific options
    scraped_dests = ('bootstrap_mrjob', 'label', 'owner')
    scrape_options_into_new_groups(
        MRJob().all_option_groups(), {parser: scraped_dests})

    add_emr_connect_opts(parser)
    add_emr_launch_opts(parser)

    alphabetize_options(parser)
    return parser
Example #12
0
def make_option_parser():
    """Build the ``OptionParser`` used by the EMR usage report.

    Adds ``--max-days-ago`` and then calls ``add_basic_opts``,
    ``add_emr_connect_opts`` and ``alphabetize_options`` on the parser.

    Returns the populated parser.
    """
    parser = OptionParser(usage='%prog [options]',
                          description='Print a giant report on EMR usage.')

    add = parser.add_option
    add('--max-days-ago', type='float', default=None, dest='max_days_ago',
        help=('Max number of days ago to look at jobs. By default, we go back'
              ' as far as EMR supports (currently about 2 months)'))

    add_basic_opts(parser)
    add_emr_connect_opts(parser)

    alphabetize_options(parser)

    return parser
Example #13
0
def main(cl_args=None):
    """Run a shell command on the nodes of an EMR job flow.

    Two positional arguments are read from the command line: the job
    flow ID and the command string (split into arguments with
    ``shlex_split``). The output directory comes from
    ``-o``/``--output-dir`` and falls back to the job flow ID; its
    absolute path is handed to ``run_on_all_nodes``.

    :param cl_args: list of command-line arguments to parse. ``None``
                    makes optparse fall back to ``sys.argv[1:]``.

    Prints the help text and exits with status 1 if fewer than two
    positional arguments are given.
    """
    # The output directory is only accepted through -o/--output-dir; the
    # positionals are the job flow ID and the command string. The usage
    # line therefore must not list OUTPUT_DIR as a positional argument,
    # otherwise users following it get their output directory name
    # executed as the command.
    usage = 'usage: %prog JOB_FLOW_ID [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # Every parsed option is forwarded to the runner except the ones
    # that are consumed by this script itself.
    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    # any positional arguments beyond the first two are ignored
    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Example #14
0
def make_option_parser():
    """Build the option parser for the EMR log-fetching tool.

    Call order: ``add_basic_opts``, the tool's own options (registered
    from the table below), ``add_emr_connect_opts``, a scrape of
    ``ec2_key_pair_file`` / ``s3_sync_wait_time`` / ``ssh_bin`` from
    ``MRJob().all_option_groups()``, and finally
    ``alphabetize_options``.

    Returns the populated ``OptionParser``.
    """
    parser = OptionParser(
        usage='usage: %prog [options] JOB_FLOW_ID',
        description=(
            'List, display, and parse Hadoop logs associated with EMR job'
            ' flows.'
            ' Useful for debugging failed jobs for which mrjob did not'
            ' display a'
            ' useful error message or for inspecting jobs whose output has'
            ' been'
            ' lost.'))

    add_basic_opts(parser)

    find_failure_help = ('Search the logs for information about why'
                         ' the job failed')
    step_num_help = ('Limit results to a single step. To be used'
                     ' with --list and --cat.')

    # (option strings, keyword arguments), in registration order
    specs = [
        (('-f', '--find-failure'),
         dict(dest='find_failure', action='store_true', default=False,
              help=find_failure_help)),
        (('-l', '--list'),
         dict(dest='list_relevant', action='store_true', default=False,
              help='List log files MRJob finds relevant')),
        (('-L', '--list-all'),
         dict(dest='list_all', action='store_true', default=False,
              help='List all log files')),
        (('-a', '--cat'),
         dict(dest='cat_relevant', action='store_true', default=False,
              help='Cat log files MRJob finds relevant')),
        (('-A', '--cat-all'),
         dict(dest='cat_all', action='store_true', default=False,
              help='Cat all log files to JOB_FLOW_ID/')),
        (('-s', '--step-num'),
         dict(dest='step_num', action='store', type='int', default=None,
              help=step_num_help)),
        (('--counters',),
         dict(dest='get_counters', action='store_true', default=False,
              help='Show counters from the job flow')),
    ]
    for opt_strings, settings in specs:
        parser.add_option(*opt_strings, **settings)

    add_emr_connect_opts(parser)

    # copy a few individual options over from the MRJob option groups
    scraped_dests = ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')
    scrape_options_into_new_groups(
        MRJob().all_option_groups(), {parser: scraped_dests})

    alphabetize_options(parser)

    return parser