    def reducer(self, n, vars):
        MRJob.set_status(self, "=============>  reducer called")

        samples_from_mappers = []
        counts_from_mappers = []

        # First read all the counts from the different mappers so we know the total number of items
        # and can give each mapper's sample set its appropriate weight
        total_counts_from_mappers = 0

        for x in vars:
            count, samples = json.loads(x)
            total_counts_from_mappers += count

            counts_from_mappers.append(count)
            samples_from_mappers.append(samples)

        # Now, based on the number of samples in each mapper, select the appropriate number of
        # samples from samples_from_mappers
        for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
            weight = count * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(round(weight * self.options.sample_size))

            for _ in range(number_of_needed_samples):
                yield 1, sample_set.pop()
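For context, a minimal sketch of the kind of job this reducer could belong to: a reservoir-sampling mapper that ships its count and samples to a single reducer, as in Example #17 below. This assumes mrjob >= 0.6's configure_args()/add_passthru_arg(); the class and option names are illustrative.

import json
import random

from mrjob.job import MRJob


class MRReservoirSample(MRJob):

    def configure_args(self):
        super(MRReservoirSample, self).configure_args()
        self.add_passthru_arg(
            '--sample-size', type=int, default=100,
            help='total number of samples to keep')

    def mapper_init(self):
        self.count = 0
        self.samples = []

    def mapper(self, _, line):
        # reservoir sampling: keep a bounded, uniform sample of the input
        self.count += 1
        if len(self.samples) < self.options.sample_size:
            self.samples.append(line)
        else:
            idx = random.randrange(self.count)
            if idx < self.options.sample_size:
                self.samples[idx] = line

    def mapper_final(self):
        # send (count, samples) to a single reducer as JSON
        yield 1, json.dumps([self.count, self.samples])


if __name__ == '__main__':
    MRReservoirSample.run()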
Example #2
    def test_cmd_line_options(self):
        mr_job = MRJob(["--jobconf", "mapred.foo=bar", "--jobconf", "mapred.foo=baz", "--jobconf", "mapred.qux=quux"])

        self.assertEqual(
            mr_job.job_runner_kwargs()["jobconf"],
            {"mapred.foo": "baz", "mapred.qux": "quux"},  # second option takes priority
        )
Example #3
    def test_cmd_line_options(self):
        mr_job = MRJob(
            ["--partitioner", "java.lang.Object", "--partitioner", "org.apache.hadoop.mapreduce.Partitioner"]
        )

        # second option takes priority
        self.assertEqual(mr_job.job_runner_kwargs()["partitioner"], "org.apache.hadoop.mapreduce.Partitioner")
Example #4
    def reducer_final(self):

        MRJob.set_status(self, "=============>  reducer final called")

        for label in self.output:
            stratum_samples = self.output[label]
            yield label, (len(stratum_samples), stratum_samples)
Example #5
def main(cl_args=None):
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_alias = options.runner or _DEFAULT_RUNNER
    runner_class = _runner_class(runner_alias)

    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)

    kwargs['input_paths'] = [os.devnull]

    step = _get_step(options, parser, cl_args)
    kwargs['steps'] = [step.description()]

    runner = runner_class(**kwargs)

    try:
        runner.run()
    finally:
        runner.cleanup()
Example #6
    def test_spark(self):
        job = MRJob(["--spark", "input_dir", "output_dir"])
        job.spark = MagicMock()

        job.execute()

        job.spark.assert_called_once_with("input_dir", "output_dir")
Example #7
def main(cl_args=None):
    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet,
                         verbose=options.verbose)

    # max_hours_idle -> max_mins_idle
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
Example #8
    def test_verbose(self):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging(verbose=True)
            log = logging.getLogger('__main__')
            log.info('INFO')
            log.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
Example #9
    def test_spark(self):
        job = MRJob(['--spark', 'input_dir', 'output_dir'])
        job.spark = MagicMock()

        job.execute()

        job.spark.assert_called_once_with('input_dir', 'output_dir')
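The mocked spark attribute stands in for a user-defined Spark step, which mrjob invokes with the input and output paths. A minimal sketch of a real implementation, modeled on mrjob's Spark wordcount example (names are illustrative):

import re

from mrjob.job import MRJob

WORD_RE = re.compile(r"[\w']+")


class MRSparkWordcount(MRJob):

    def spark(self, input_path, output_path):
        # deferred import so the module still loads where pyspark is absent
        from pyspark import SparkContext

        sc = SparkContext(appName='mrjob spark wordcount')

        sc.textFile(input_path) \
            .flatMap(lambda line: WORD_RE.findall(line)) \
            .map(lambda word: (word.lower(), 1)) \
            .reduceByKey(lambda a, b: a + b) \
            .saveAsTextFile(output_path)

        sc.stop()


if __name__ == '__main__':
    MRSparkWordcount.run()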
Example #10
def main(args=None):
    now = _boto3_now()

    arg_parser = _make_arg_parser()
    options = arg_parser.parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if not options.exclude:
        filtered_cluster_summaries = cluster_summaries
    else:
        filtered_cluster_summaries = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)

    job_info = _find_long_running_jobs(
        emr_client, filtered_cluster_summaries, min_time, now=now)

    _print_report(job_info)
Example #11
    def test_deprecated_mapper_final_positional_arg(self):
        def mapper(k, v):
            pass

        def reducer(k, v):
            pass

        def mapper_final():
            pass

        stderr = StringIO()
        with no_handlers_for_logger():
            log_to_stream('mrjob.job', stderr)
            step = MRJob.mr(mapper, reducer, mapper_final)

        # should be allowed to specify mapper_final as a positional arg,
        # but we log a warning
        self.assertEqual(
            step,
            MRJob.mr(
                mapper=mapper, reducer=reducer, mapper_final=mapper_final))
        self.assertIn('mapper_final should be specified', stderr.getvalue())

        # can't specify mapper_final as a positional and keyword arg
        self.assertRaises(
            TypeError,
            MRJob.mr,
            mapper,
            reducer,
            mapper_final,
            mapper_final=mapper_final)
Example #12
    def test_bytes_value_protocol(self):
        job = MRJob()
        job.OUTPUT_PROTOCOL = BytesValueProtocol

        self.assertEqual(
            job.parse_output_line(b'one two\n'),
            (None, b'one two\n'))
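BytesValueProtocol hands each raw line through with a None key and no JSON decoding. A sketch of how a job might opt into it on both ends (a hypothetical pass-through job):

from mrjob.job import MRJob
from mrjob.protocol import BytesValueProtocol


class MRBytesPassThrough(MRJob):
    # raw bytes in, raw bytes out; keys are None and values skip JSON entirely
    INPUT_PROTOCOL = BytesValueProtocol
    OUTPUT_PROTOCOL = BytesValueProtocol

    def mapper(self, _, line):
        yield None, line


if __name__ == '__main__':
    MRBytesPassThrough.run()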
Example #13
def main(args):
    # parse command-line args
    usage = '%prog [options]'
    description = "Collect EMR stats from active jobflows. "
    description += "Active jobflows are those in states of: "
    description += "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
    description += "Collected stats include total number of active jobflows"
    description += "and total number of Amazon EC2 instances used to execute"
    description += "these jobflows. The instance counts are not separated by"
    description += "instance type."
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)

    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)
    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
Example #14
    def __init__(self, *args, **kwargs):
        MRJob.__init__(self, *args, **kwargs)

        ## load entities from json file
        log("loading entity list")
        entities = json.load(urllib.urlopen("https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"))
        self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
Example #15
    def test_spark_method(self):
        j = MRJob(["--no-conf"])
        j.spark = MagicMock()

        self.assertEqual(j.steps(), [SparkStep(j.spark)])

        self.assertEqual(j._steps_desc(), [dict(type="spark", spark_args=[])])
Example #16
    def test_empty(self):
        mr_job = MRJob()

        self.assertEqual(mr_job._runner_kwargs()['hadoop_input_format'],
                         None)
        self.assertEqual(mr_job._runner_kwargs()['hadoop_output_format'],
                         None)
Example #17
    def mapper_final(self):
        MRJob.set_status(self, "=============>  mapper final called")

        out = [self.count, self.samples]
        jOut = json.dumps(out)

        yield 1, jOut
Example #18
    def reducer(self, n, vars):
        MRJob.set_status(self, "=============>  reducer called")

        print "reducer:", vars
        samples_from_mappers = []
        counts_from_mappers = []

        # First read all the counts from the different mappers so we know the total number of items
        # and can give each mapper's sample set its appropriate weight
        total_counts_from_mappers = 0

        for x in vars:
            count, samples = json.loads(x)
            total_counts_from_mappers += count

            counts_from_mappers.append(count)
            samples_from_mappers.append(samples)

        # Now, based on the number of samples in each mapper, select the appropriate number of
        # samples from samples_from_mappers
        # write the selected samples to a local file instead of yielding them
        with open(os.path.join(PROJECT_ROOT, 'output.txt'), 'w') as file_out:
            for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
                weight = count * 1.0 / total_counts_from_mappers
                number_of_needed_samples = int(round(weight * self.options.sample_size))

                for _ in range(number_of_needed_samples):
                    file_out.write(str(sample_set.pop()) + '\n')

        # the unreachable yield keeps this method a generator, which mrjob expects
        if False:
            yield 1, 2
Example #19
    def test_mr(self):

        def mapper(k, v):
            pass

        def mapper_init():
            pass

        def mapper_final():
            pass

        def reducer(k, vs):
            pass

        def reducer_init():
            pass

        def reducer_final():
            pass

        # make sure it returns the format we currently expect
        self.assertEqual(MRJob.mr(mapper, reducer),
                         stepdict(mapper, reducer))
        self.assertEqual(MRJob.mr(mapper, reducer,
                                  mapper_init=mapper_init,
                                  mapper_final=mapper_final,
                                  reducer_init=reducer_init,
                                  reducer_final=reducer_final),
                         stepdict(mapper, reducer,
                                  mapper_init=mapper_init,
                                  mapper_final=mapper_final,
                                  reducer_init=reducer_init,
                                  reducer_final=reducer_final))
        self.assertEqual(MRJob.mr(mapper),
                         stepdict(mapper))
Example #20
    def test_default_protocol(self):
        job = MRJob()

        data = iter([b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n'])
        self.assertEqual(
            list(job.parse_output(data)),
            [(1, 2), ({'3': 4}, 'five')])
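parse_output() reassembles the runner's raw byte chunks and decodes them with the job's output protocol (JSON by default), which is why the split-up chunks above still parse cleanly. A sketch of the usual driver pattern that relies on it (MRMyJob and its module are placeholders):

from mr_my_job import MRMyJob  # hypothetical job module

mr_job = MRMyJob(args=['input.txt'])
with mr_job.make_runner() as runner:
    runner.run()
    for key, value in mr_job.parse_output(runner.cat_output()):
        print(key, value)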
Example #21
    def test_wrong_type_of_step(self):
        mr_job = MRJob()
        mr_job.spark = MagicMock()

        self.assertRaises(TypeError, mr_job.run_mapper)
        self.assertRaises(TypeError, mr_job.run_combiner)
        self.assertRaises(TypeError, mr_job.run_reducer)
Example #22
    def test_default_options(self):
        with no_handlers_for_logger('__main__'):
            with patch.object(sys, 'stderr', StringIO()) as stderr:
                MRJob.set_up_logging()
                log = logging.getLogger('__main__')
                log.info('INFO')
                log.debug('DEBUG')
                self.assertEqual(stderr.getvalue(), 'INFO\n')
Example #23
def main(args=None):
    option_parser = make_option_parser()
    options = parse_args(option_parser, args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    with EMRJobRunner(**runner_kwargs(options)) as runner:
        perform_actions(options, runner)
Example #24
    def test_spark_and_spark_args_methods(self):
        j = MRJob(["--no-conf"])
        j.spark = MagicMock()
        j.spark_args = MagicMock(return_value=["argh", "ARRRRGH!"])

        self.assertEqual(j.steps(), [SparkStep(j.spark, spark_args=["argh", "ARRRRGH!"])])

        self.assertEqual(j._steps_desc(), [dict(type="spark", spark_args=["argh", "ARRRRGH!"])])
Example #25
    def test_no_mapper(self):
        def mapper_final(k, v): pass
        def reducer(k, vs): pass

        assert_equal(MRJob.mr(), (_IDENTITY_MAPPER, None))
        assert_equal(MRJob.mr(reducer=reducer), (_IDENTITY_MAPPER, reducer))
        assert_equal(MRJob.mr(reducer=reducer, mapper_final=mapper_final),
                     ((_IDENTITY_MAPPER, mapper_final), reducer))
Example #26
    def __init__(self, args):
        # rebuild args from consecutive pairs of stdin lines, with quotes escaped
        lines = [line.strip() for line in sys.stdin]
        temp_list = list()
        for i in range(0, len(lines) - 1):
            temp_list.append(lines[i].replace('"', '\\"'))
            temp_list.append(lines[i + 1].replace('"', '\\"'))
        args = temp_list
        MRJob.__init__(self, args)
Example #27
    def test_default(self):
        # test parsing JSON
        mr_job = MRJob()
        output = '0\t1\n"a"\t"b"\n'
        mr_job.stdout = StringIO(output)
        self.assertEqual(mr_job.parse_output(), [(0, 1), ('a', 'b')])

        # verify that stdout is not cleared
        self.assertEqual(mr_job.stdout.getvalue(), output)
Example #28
    def test_libjars_attr_relative_path(self):
        job_dir = os.path.dirname(MRJob.mr_job_script())

        with patch.object(MRJob, 'LIBJARS', ['cookie.jar', '/left/dora.jar']):
            job = MRJob()

            self.assertEqual(
                job._runner_kwargs()['libjars'],
                [os.path.join(job_dir, 'cookie.jar'), '/left/dora.jar'])
Example #29
    def test_deprecation_warning(self):
        job = MRJob()

        job.parse_output_line(b'1\t2\n')
        self.assertEqual(self.log.warning.call_count, 1)

        # only warn once
        job.parse_output_line(b'3\t4\n')
        self.assertEqual(self.log.warning.call_count, 1)
Example #30
    def test_cmd_line_options(self):
        mr_job = MRJob([
            '--partitioner', 'java.lang.Object',
            '--partitioner', 'org.apache.hadoop.mapreduce.Partitioner'
        ])

        # second option takes priority
        self.assertEqual(mr_job.job_runner_kwargs()['partitioner'],
                         'org.apache.hadoop.mapreduce.Partitioner')
Example #31
def make_option_parser():
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')

    option_parser = OptionParser(usage=usage, description=description)

    add_basic_opts(option_parser)

    option_parser.add_option('-f',
                             '--find-failure',
                             dest='find_failure',
                             action='store_true',
                             default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    option_parser.add_option('-l',
                             '--list',
                             dest='list_relevant',
                             action="store_true",
                             default=False,
                             help='List log files MRJob finds relevant')

    option_parser.add_option('-L',
                             '--list-all',
                             dest='list_all',
                             action="store_true",
                             default=False,
                             help='List all log files')

    option_parser.add_option('-a',
                             '--cat',
                             dest='cat_relevant',
                             action="store_true",
                             default=False,
                             help='Cat log files MRJob finds relevant')

    option_parser.add_option('-A',
                             '--cat-all',
                             dest='cat_all',
                             action="store_true",
                             default=False,
                             help='Cat all log files to JOB_FLOW_ID/')

    option_parser.add_option('-s',
                             '--step-num',
                             dest='step_num',
                             action='store',
                             type='int',
                             default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters',
                             dest='get_counters',
                             action='store_true',
                             default=False,
                             help='Show counters from the job flow')

    add_emr_connect_opts(option_parser)

    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {option_parser: ('ec2_key_pair_file', 's3_sync_wait_time', 'ssh_bin')})

    alphabetize_options(option_parser)

    return option_parser
Example #32
    def test_spark_and_streaming_dont_mix(self):
        j = MRJob(['--no-conf'])
        j.mapper = MagicMock()
        j.spark = MagicMock()

        self.assertRaises(ValueError, j.steps)