Beispiel #1
0
def _make_option_parser():
    usage = '%prog [options] <time-untouched> <URIs>'
    description = (
        'Delete all files in a given URI that are older than a specified'
        ' time.\n\nThe time parameter defines the threshold for removing'
        ' files. If the file has not been accessed for *time*, the file is'
        ' removed. The time argument is a number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days.  If no suffix is specified, time is in hours.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option(
        '-t',
        '--test',
        dest='test',
        default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would")

    _add_basic_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('aws_region', 's3_endpoint'),
    })

    _alphabetize_options(option_parser)

    return option_parser
Beispiel #2
0
def _make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    _add_basic_opts(option_parser)
    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: (
            'bootstrap_mrjob',
            'label',
            'owner',
        ),
    })

    _add_emr_connect_opts(option_parser)
    _add_emr_launch_opts(option_parser)
    _add_dataproc_emr_opts(option_parser)

    _alphabetize_options(option_parser)
    return option_parser
Beispiel #3
0
def make_option_parser():
    usage = '%prog [options] <time-untouched> <URIs>'
    description = (
        'Delete all files in a given URI that are older than a specified'
        ' time.\n\nThe time parameter defines the threshold for removing'
        ' files. If the file has not been accessed for *time*, the file is'
        ' removed. The time argument is a number with an optional'
        ' single-character suffix specifying the units: m for minutes, h for'
        ' hours, d for days.  If no suffix is specified, time is in hours.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option(
        '-t', '--test', dest='test', default=False,
        action='store_true',
        help="Don't actually delete any files; just log that we would")

    add_basic_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('aws_region', 's3_endpoint'),
    })

    alphabetize_options(option_parser)

    return option_parser
Beispiel #4
0
def _make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR cluster to run jobs in, and print its ID to'
        ' stdout. WARNING: Do not run'
        ' this without mrjob terminate-idle-clusters in your'
        ' crontab; clusters left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    _add_basic_opts(option_parser)
    # these aren't nicely broken down, just scrape specific options
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: (
            'bootstrap_mrjob',
            'label',
            'owner',
        ),
    })

    _add_emr_connect_opts(option_parser)
    _add_emr_launch_opts(option_parser)
    _add_dataproc_emr_opts(option_parser)

    _alphabetize_options(option_parser)
    return option_parser
Beispiel #5
0
def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Create a persistent EMR job flow to run jobs in. WARNING: do not run'
        ' this without mrjob.tools.emr.terminate.idle_job_flows in your'
        ' crontab; job flows left idle can quickly become expensive!')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    runner_group = make_option_group('Running the entire job')
    hadoop_emr_opt_group = make_option_group(
        'Running on Hadoop or EMR (these apply when you set -r hadoop or -r'
        ' emr)')
    emr_opt_group = make_option_group(
        'Running on Amazon Elastic MapReduce (these apply when you set -r'
        ' emr)')

    assignments = {
        runner_group: ('bootstrap_mrjob', 'conf_path', 'quiet', 'verbose'),
        hadoop_emr_opt_group: (
            'label',
            'owner',
        ),
        emr_opt_group: (
            'additional_emr_info',
            'aws_availability_zone',
            'aws_region',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'emr_job_flow_pool_name',
            'enable_emr_debugging',
            'hadoop_version',
            'num_ec2_instances',
            'pool_emr_job_flows',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
        ),
    }

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    job_option_groups = mr_job.all_option_groups()
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
Beispiel #6
0
def make_option_parser():
    usage = '%prog [options]'
    description = 'Create a persistent EMR job flow to run jobs in. WARNING: do not run this without mrjob.tools.emr.terminate.idle_job_flows in your crontab; job flows left idle can quickly become expensive!'
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    runner_group = make_option_group('Running the entire job')
    hadoop_emr_opt_group = make_option_group('Running on Hadoop or EMR (these apply when you set -r hadoop or -r emr)')
    emr_opt_group = make_option_group('Running on Amazon Elastic MapReduce (these apply when you set -r emr)')

    assignments = {
        runner_group: (
            'bootstrap_mrjob',
            'conf_path',
            'quiet',
            'verbose'
        ),
        hadoop_emr_opt_group: (
            'label',
            'owner',
        ),
        emr_opt_group: (
            'additional_emr_info',
            'aws_availability_zone',
            'aws_region',
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_python_packages',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'enable_emr_debugging',
            'hadoop_version',
            'num_ec2_instances',
            's3_endpoint',
            's3_log_uri',
            's3_scratch_uri',
            's3_sync_wait_time',
        ),
    }

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
Beispiel #7
0
 def test_scrape_all(self):
     assignments = {
         self.new_parser: ('a',),
         self.new_group_1: ('x', 'y'),
     }
     old_groups = (self.original_parser, self.original_group)
     scrape_options_into_new_groups(old_groups, assignments)
     self.assertEqual(self.original_parser.option_list[1:],
                      self.new_parser.option_list[1:])
     self.assertEqual(self.original_group.option_list,
                      self.new_group_1.option_list)
Beispiel #8
0
 def test_scrape_all(self):
     assignments = {
         self.new_parser: ('a', ),
         self.new_group_1: ('x', 'y'),
     }
     old_groups = (self.original_parser, self.original_group)
     scrape_options_into_new_groups(old_groups, assignments)
     self.assertEqual(self.original_parser.option_list[1:],
                      self.new_parser.option_list[1:])
     self.assertEqual(self.original_group.option_list,
                      self.new_group_1.option_list)
Beispiel #9
0
 def test_scrape_different(self):
     assignments = {self.new_parser: ("x",), self.new_group_1: ("y",), self.new_group_2: ("a",)}
     old_groups = (self.original_parser, self.original_group)
     scrape_options_into_new_groups(old_groups, assignments)
     target_1 = self.original_group.option_list[:1]
     target_2 = self.original_group.option_list[1:]
     target_3 = self.original_parser.option_list[1:]
     self.assertEqual(target_1, self.new_parser.option_list[1:])
     self.assertEqual(target_2, self.new_group_1.option_list)
     self.assertEqual(target_3, self.new_group_2.option_list)
     options, args = self.new_parser.parse_args(["-x", "happy"])
     self.assertEqual(options.x, "happy")
Beispiel #10
0
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose',
                        'ec2_key_pair_file', 's3_sync_wait_time')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
Beispiel #11
0
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_paths', 'quiet', 'verbose',
                        'ec2_key_pair_file', 's3_sync_wait_time')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)
    return option_parser
Beispiel #12
0
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    add_basic_opts(option_parser)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')
    })

    alphabetize_options(option_parser)

    return option_parser
Beispiel #13
0
 def test_scrape_different(self):
     assignments = {
         self.new_parser: ('x',),
         self.new_group_1: ('y',),
         self.new_group_2: ('a',),
     }
     old_groups = (self.original_parser, self.original_group)
     scrape_options_into_new_groups(old_groups, assignments)
     target_1 = self.original_group.option_list[:1]
     target_2 = self.original_group.option_list[1:]
     target_3 = self.original_parser.option_list[1:]
     assert_equal(target_1, self.new_parser.option_list[1:])
     assert_equal(target_2, self.new_group_1.option_list)
     assert_equal(target_3, self.new_group_2.option_list)
     options, args = self.new_parser.parse_args(['-x', 'happy'])
     assert_equal(options.x, 'happy')
Beispiel #14
0
 def test_scrape_different(self):
     assignments = {
         self.new_parser: ('x', ),
         self.new_group_1: ('y', ),
         self.new_group_2: ('a', ),
     }
     old_groups = (self.original_parser, self.original_group)
     scrape_options_into_new_groups(old_groups, assignments)
     target_1 = self.original_group.option_list[:1]
     target_2 = self.original_group.option_list[1:]
     target_3 = self.original_parser.option_list[1:]
     self.assertEqual(target_1, self.new_parser.option_list[1:])
     self.assertEqual(target_2, self.new_group_1.option_list)
     self.assertEqual(target_3, self.new_group_2.option_list)
     options, args = self.new_parser.parse_args(['-x', 'happy'])
     self.assertEqual(options.x, 'happy')
Beispiel #15
0
def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose', 'ec2_key_pair_file')
    }

    option_parser.add_option('-o',
                             '--output-dir',
                             dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex.split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Beispiel #16
0
def main():
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")

    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)

    options, args = option_parser.parse_args()

    if not options.quiet:
        log_to_stream(name='mrjob', debug=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex.split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Beispiel #17
0
def main(cl_args=None):
    usage = 'usage: %prog CLUSTER_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR'
                   ' cluster. Store stdout/stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o',
                             '--output-dir',
                             dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " CLUSTER_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    cluster_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or cluster_id)

    with EMRJobRunner(cluster_id=cluster_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Beispiel #18
0
def main(cl_args=None):
    usage = 'usage: %prog JOB_FLOW_ID OUTPUT_DIR [options] "command string"'
    description = ('Run a command on the master and all slaves of an EMR job'
                   ' flow. Store stdout and stderr for results in OUTPUT_DIR.')

    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option('-o', '--output-dir', dest='output_dir',
                             default=None,
                             help="Specify an output directory (default:"
                             " JOB_FLOW_ID)")
    add_basic_opts(option_parser)
    add_emr_connect_opts(option_parser)
    scrape_options_into_new_groups(MRJob().all_option_groups(), {
        option_parser: ('ec2_key_pair_file', 'ssh_bin'),
    })
    alphabetize_options(option_parser)

    options, args = option_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('output_dir', 'quiet', 'verbose'):
        del runner_kwargs[unused_arg]

    if len(args) < 2:
        option_parser.print_help()
        sys.exit(1)

    job_flow_id, cmd_string = args[:2]
    cmd_args = shlex_split(cmd_string)

    output_dir = os.path.abspath(options.output_dir or job_flow_id)

    with EMRJobRunner(emr_job_flow_id=job_flow_id, **runner_kwargs) as runner:
        runner._enable_slave_ssh_access()
        run_on_all_nodes(runner, output_dir, cmd_args)
Beispiel #19
0
def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_paths',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_core_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)
    return option_parser
Beispiel #20
0
def main():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    import boto.emr.connection
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)
    options, args = option_parser.parse_args()

    log_to_stream(name='mrjob', debug=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for non_runner_kwarg in ('quiet', 'verbose', 'list_all', 'find',
                             'terminate'):
        del runner_kwargs[non_runner_kwarg]

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)

    if options.find:
        sorted_job_flows = runner.usable_job_flows()

        if sorted_job_flows:
            jf = sorted_job_flows[-1]
            print 'You should use this one:'
            pprint_job_flow(jf)
        else:
            print 'No idle job flows match criteria'

    if options.terminate:
        terminate(runner, options.terminate)
Beispiel #21
0
def make_option_parser():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_paths',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_core_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a',
                             '--all',
                             action='store_true',
                             default=False,
                             dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f',
                             '--find',
                             action='store_true',
                             default=False,
                             dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t',
                             '--terminate',
                             action='store',
                             default=None,
                             dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)
    return option_parser
Beispiel #22
0
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    add_basic_opts(option_parser)

    option_parser.add_option('-f',
                             '--find-failure',
                             dest='find_failure',
                             action='store_true',
                             default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l',
                             '--list',
                             dest='list_relevant',
                             action="store_true",
                             default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L',
                             '--list-all',
                             dest='list_all',
                             action="store_true",
                             default=False,
                             help='List all log files')

    option_parser.add_option('-a',
                             '--cat',
                             dest='cat_relevant',
                             action="store_true",
                             default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A',
                             '--cat-all',
                             dest='cat_all',
                             action="store_true",
                             default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s',
                             '--step-num',
                             dest='step_num',
                             action='store',
                             type='int',
                             default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters',
                             dest='get_counters',
                             action='store_true',
                             default=False,
                             help='Show counters from the job flow')

    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {option_parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')})

    alphabetize_options(option_parser)

    return option_parser
Beispiel #23
0
def main():
    usage = '%prog [options]'
    description = (
        'Inspect available job flow pools or identify job flows suitable for'
        ' running a job with the specified options.')
    option_parser = OptionParser(usage=usage, description=description)

    import boto.emr.connection
    boto.emr.connection.JobFlow.Fields.add('HadoopVersion')

    def make_option_group(halp):
        g = OptionGroup(option_parser, halp)
        option_parser.add_option_group(g)
        return g

    ec2_opt_group = make_option_group('EC2 instance configuration')
    hadoop_opt_group = make_option_group('Hadoop configuration')
    job_opt_group = make_option_group('Job flow configuration')

    assignments = {
        option_parser: (
            'conf_path',
            'emr_job_flow_pool_name',
            'quiet',
            'verbose',
        ),
        ec2_opt_group: (
            'aws_availability_zone',
            'ec2_instance_type',
            'ec2_key_pair',
            'ec2_key_pair_file',
            'ec2_master_instance_type',
            'ec2_slave_instance_type',
            'emr_endpoint',
            'num_ec2_instances',
        ),
        hadoop_opt_group: (
            'hadoop_version',
            'label',
            'owner',
        ),
        job_opt_group: (
            'bootstrap_actions',
            'bootstrap_cmds',
            'bootstrap_files',
            'bootstrap_mrjob',
            'bootstrap_python_packages',
        ),
    }

    option_parser.add_option('-a', '--all', action='store_true',
                             default=False, dest='list_all',
                             help=('List all available job flows without'
                                   ' filtering by configuration'))
    option_parser.add_option('-f', '--find', action='store_true',
                             default=False, dest='find',
                             help=('Find a job flow matching the pool name,'
                                   ' bootstrap configuration, and instance'
                                   ' number/type as specified on the command'
                                   ' line and in the configuration files'))
    option_parser.add_option('-t', '--terminate', action='store',
                             default=None, dest='terminate',
                             metavar='JOB_FLOW_ID',
                             help=('Terminate all job flows in the given pool'
                                   ' (defaults to pool "default")'))

    # Scrape options from MRJob and index them by dest
    mr_job = MRJob()
    scrape_options_into_new_groups(mr_job.all_option_groups(), assignments)
    options, args = option_parser.parse_args()

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    runner_kwargs = options.__dict__.copy()
    for non_runner_kwarg in ('quiet', 'verbose', 'list_all', 'find',
                             'terminate'):
        del runner_kwargs[non_runner_kwarg]

    runner = EMRJobRunner(**runner_kwargs)

    if options.list_all:
        pprint_pools(runner)

    if options.find:
        sorted_job_flows = runner.usable_job_flows()

        if sorted_job_flows:
            jf = sorted_job_flows[-1]
            print 'You should use this one:'
            pprint_job_flow(jf)
        else:
            print 'No idle job flows match criteria'

    if options.terminate:
        terminate(runner, options.terminate)
Beispiel #24
0
def main():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')

    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')

    assignments = {
        option_parser: ('conf_path', 'quiet', 'verbose',
                        'ec2_key_pair_file')
    }

    mr_job = MRJob()
    job_option_groups = (mr_job.option_parser, mr_job.mux_opt_group,
                         mr_job.proto_opt_group, mr_job.runner_opt_group,
                         mr_job.hadoop_emr_opt_group, mr_job.emr_opt_group,
                         mr_job.hadoop_opts_opt_group)
    scrape_options_into_new_groups(job_option_groups, assignments)

    options, args = option_parser.parse_args()

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    if options.step_num:
        step_nums = [options.step_num]
    else:
        step_nums = None

    runner_kwargs = options.__dict__.copy()
    for unused_arg in ('quiet', 'verbose', 'list_relevant', 'list_all',
                       'cat_relevant', 'cat_all', 'get_counters', 'step_num',
                      'find_failure'):
        del runner_kwargs[unused_arg]

    with EMRJobRunner(emr_job_flow_id=args[0], **runner_kwargs) as runner:
        if options.list_relevant:
            list_relevant(runner, step_nums)

        if options.list_all:
            list_all(runner)

        if options.cat_relevant:
            cat_relevant(runner, step_nums)

        if options.cat_all:
            cat_all(runner)

        if options.get_counters:
            desc = runner._describe_jobflow()
            runner._set_s3_job_log_uri(desc)
            runner._fetch_counters(
                xrange(1, len(desc.steps) + 1), skip_s3_wait=True)
            runner.print_counters()

        if options.find_failure:
            find_failure(runner, options.step_num)