def reducer(self, n, vars):
    """Merge per-mapper reservoir samples into one weighted overall sample.

    Args:
        n: the single shared key every mapper emitted (ignored).
        vars: JSON strings, one per mapper, each decoding to
            ``[count, samples]`` — the number of items that mapper saw
            and its reservoir of samples.

    Yields:
        ``(1, sample)`` pairs; each mapper contributes a number of samples
        proportional to its share of the total input.
    """
    MRJob.set_status(self, "=============> reducer called")

    samples_from_mappers = []
    counts_from_mappers = []

    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        record = json.loads(x)  # renamed from `input` to avoid shadowing the builtin
        total_counts_from_mappers += record[0]
        counts_from_mappers.append(record[0])
        samples_from_mappers.append(record[1])

    # Now draw from each mapper's reservoir a number of samples proportional
    # to the fraction of the total input that mapper processed.
    for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
        weight = count * 1.0 / total_counts_from_mappers
        number_of_needed_samples = int(round(weight * self.options.sample_size))
        for _ in range(number_of_needed_samples):
            yield 1, sample_set.pop()
def test_cmd_line_options(self):
    """Repeated --jobconf flags: the later value wins for a duplicated key."""
    args = [
        "--jobconf", "mapred.foo=bar",
        "--jobconf", "mapred.foo=baz",
        "--jobconf", "mapred.qux=quux",
    ]
    mr_job = MRJob(args)

    jobconf = mr_job.job_runner_kwargs()["jobconf"]
    # second option takes priority
    self.assertEqual(jobconf, {"mapred.foo": "baz", "mapred.qux": "quux"})
def test_cmd_line_options(self):
    """When --partitioner is given twice, the last value is used."""
    args = ["--partitioner", "java.lang.Object",
            "--partitioner", "org.apache.hadoop.mapreduce.Partitioner"]
    mr_job = MRJob(args)

    # second option takes priority
    partitioner = mr_job.job_runner_kwargs()["partitioner"]
    self.assertEqual(partitioner, "org.apache.hadoop.mapreduce.Partitioner")
def reducer_final(self):
    """Emit each accumulated stratum as (label, (sample_count, samples))."""
    MRJob.set_status(self, "=============> reducer final called")
    for label in self.output:
        samples = self.output[label]
        yield label, (len(samples), samples)
def main(cl_args=None):
    """Parse the command line, build a single-step job, and run it."""
    parser = _make_arg_parser()
    options = parser.parse_args(cl_args)

    runner_class = _runner_class(options.runner or _DEFAULT_RUNNER)

    # bail out early with usage info if asked to, or if nothing was given to run
    if options.help or not options.script_or_jar:
        _print_help(options, runner_class)
        sys.exit(0)

    MRJob.set_up_logging(
        quiet=options.quiet,
        verbose=options.verbose,
    )

    # assemble runner kwargs: option-derived, then hard-coded, then our step
    kwargs = _get_runner_opt_kwargs(options, runner_class)
    kwargs.update(_HARD_CODED_OPTS)
    kwargs['input_paths'] = [os.devnull]
    kwargs['steps'] = [_get_step(options, parser, cl_args).description()]

    runner = runner_class(**kwargs)
    try:
        runner.run()
    finally:
        # always clean up temp files etc., even if the run fails
        runner.cleanup()
def test_spark(self):
    """execute() in --spark mode forwards the two dirs to spark()."""
    mr_job = MRJob(["--spark", "input_dir", "output_dir"])
    mr_job.spark = MagicMock()

    mr_job.execute()

    mr_job.spark.assert_called_once_with("input_dir", "output_dir")
def main(cl_args=None):
    """Terminate idle EMR clusters according to the command-line options."""
    options = _make_arg_parser().parse_args(cl_args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # translate the deprecated --max-hours-idle flag into minutes, unless
    # --max-mins-idle was given explicitly
    max_mins_idle = options.max_mins_idle
    if max_mins_idle is None and options.max_hours_idle is not None:
        log.warning('--max-hours-idle is deprecated and will be removed'
                    ' in v0.7.0. Please use --max-mins-idle instead.')
        max_mins_idle = options.max_hours_idle * 60

    # this flag is accepted but ignored
    if options.mins_to_end_of_hour is not None:
        log.warning('--mins-to-end-of-hour is deprecated as of v0.6.0'
                    ' and does nothing')

    _maybe_terminate_clusters(
        dry_run=options.dry_run,
        max_mins_idle=max_mins_idle,
        unpooled_only=options.unpooled_only,
        now=_boto3_now(),
        pool_name=options.pool_name,
        pooled_only=options.pooled_only,
        max_mins_locked=options.max_mins_locked,
        quiet=options.quiet,
        **_runner_kwargs(options)
    )
def test_verbose(self):
    """verbose=True enables DEBUG-level messages on stderr."""
    with patch.object(sys, 'stderr', StringIO()) as stderr:
        MRJob.set_up_logging(verbose=True)
        logger = logging.getLogger('__main__')
        logger.info('INFO')
        logger.debug('DEBUG')
        self.assertEqual(stderr.getvalue(), 'INFO\nDEBUG\n')
def test_spark(self):
    """In --spark mode, execute() passes both dirs straight to spark()."""
    mr_job = MRJob(['--spark', 'input_dir', 'output_dir'])
    mr_job.spark = MagicMock()

    mr_job.execute()

    mr_job.spark.assert_called_once_with('input_dir', 'output_dir')
def main(args=None):
    """Report EMR jobs that have been running longer than a threshold."""
    now = _boto3_now()

    options = _make_arg_parser().parse_args(args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('getting information about running jobs')

    min_time = timedelta(hours=options.min_hours)

    emr_client = EMRJobRunner(**_runner_kwargs(options)).make_emr_client()

    # only look at clusters that are (or are about to be) doing work
    cluster_summaries = _boto3_paginate(
        'Clusters', emr_client, 'list_clusters',
        ClusterStates=['STARTING', 'BOOTSTRAPPING', 'RUNNING'])

    if options.exclude:
        filtered = _filter_clusters(
            cluster_summaries, emr_client, options.exclude)
    else:
        filtered = cluster_summaries

    job_info = _find_long_running_jobs(emr_client, filtered, min_time, now=now)

    _print_report(job_info)
def test_deprecated_mapper_final_positional_arg(self):
    """mapper_final may still be passed positionally, but logs a warning."""
    def mapper(k, v):
        pass

    def reducer(k, v):
        pass

    def mapper_final():
        pass

    stderr = StringIO()
    with no_handlers_for_logger():
        log_to_stream('mrjob.job', stderr)
        step = MRJob.mr(mapper, reducer, mapper_final)

    # should be allowed to specify mapper_final as a positional arg,
    # but we log a warning
    expected = MRJob.mr(
        mapper=mapper, reducer=reducer, mapper_final=mapper_final)
    self.assertEqual(step, expected)
    self.assertIn('mapper_final should be specified', stderr.getvalue())

    # can't specify mapper_final as a positional and keyword arg
    self.assertRaises(
        TypeError,
        MRJob.mr, mapper, reducer, mapper_final, mapper_final=mapper_final)
def test_bytes_value_protocol(self):
    """BytesValueProtocol parses a line to (None, raw_bytes)."""
    job = MRJob()
    job.OUTPUT_PROTOCOL = BytesValueProtocol

    parsed = job.parse_output_line(b'one two\n')
    self.assertEqual(parsed, (None, b'one two\n'))
def main(args):
    """Collect and print stats about active EMR job flows.

    Args:
        args: command-line argument list (excluding the program name).
    """
    # parse command-line args
    usage = '%prog [options]'
    # NOTE: the original built this with `+=` and was missing the separating
    # spaces between several sentences ("jobflowsand", "executethese",
    # "byinstance"); fixed here.
    description = (
        "Collect EMR stats from active jobflows. "
        "Active jobflows are those in states of: "
        "BOOTSTRAPPING, RUNNING, STARTING, and WAITING. "
        "Collected stats include total number of active jobflows "
        "and total number of Amazon EC2 instances used to execute "
        "these jobflows. The instance counts are not separated by "
        "instance type.")
    option_parser = OptionParser(usage=usage, description=description)
    option_parser.add_option(
        "-p", "--pretty-print",
        action="store_true", dest="pretty_print", default=False,
        help=('Pretty print the collected stats'))
    add_basic_opts(option_parser)

    options, args = option_parser.parse_args(args)
    if args:
        option_parser.error('takes no arguments')

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    log.info('collecting EMR active jobflows...')
    job_flows = collect_active_job_flows(options.conf_paths)
    log.info('compiling stats from collected jobflows...')
    stats = job_flows_to_stats(job_flows)

    if options.pretty_print:
        pretty_print(stats)
    else:
        print(json.dumps(stats))
def __init__(self, *args, **kwargs):
    """Initialize the job and preload the TREC KBA entity list."""
    MRJob.__init__(self, *args, **kwargs)

    # load the entity list from the JSON file hosted on S3
    log("loading entity list")
    entity_url = "https://s3.amazonaws.com/trec-kba-2012/entity-urlnames.json"
    entities = json.load(urllib.urlopen(entity_url))
    self.entity_representations = toy_kba_algorithm.prepare_entities(entities)
def test_spark_method(self):
    """Defining spark() produces a single SparkStep with no extra args."""
    job = MRJob(["--no-conf"])
    job.spark = MagicMock()

    self.assertEqual(job.steps(), [SparkStep(job.spark)])
    self.assertEqual(
        job._steps_desc(), [dict(type="spark", spark_args=[])])
def test_empty(self):
    """With no overrides, Hadoop input/output formats default to None."""
    mr_job = MRJob()
    kwargs = mr_job._runner_kwargs()
    self.assertEqual(kwargs['hadoop_input_format'], None)
    self.assertEqual(kwargs['hadoop_output_format'], None)
def mapper_final(self):
    """Emit this mapper's item count and reservoir as one JSON record."""
    MRJob.set_status(self, "=============> mapper final called")
    payload = json.dumps([self.count, self.samples])
    yield 1, payload
def reducer(self, n, vars):
    """Write a weighted merged sample from all mappers to output.txt.

    Args:
        n: the single shared key every mapper emitted (ignored).
        vars: JSON strings, one per mapper, each decoding to
            ``[count, samples]``.

    This reducer writes its result to a file instead of yielding it; the
    dead ``yield`` at the end only exists to make the function a generator,
    which the framework requires of reducers.
    """
    MRJob.set_status(self, "=============> reducer called")
    print("reducer: " + str(vars))

    samples_from_mappers = []
    counts_from_mappers = []

    # First read all the counts from the different mappers so we know the
    # total number of items and can give each mapper's sample set its
    # appropriate weight.
    total_counts_from_mappers = 0
    for x in vars:
        record = json.loads(x)  # renamed from `input` to avoid shadowing the builtin
        total_counts_from_mappers += record[0]
        counts_from_mappers.append(record[0])
        samples_from_mappers.append(record[1])

    # Use a context manager so the output file is closed even if an
    # exception is raised mid-write (the original leaked the handle).
    with open(os.path.join(PROJECT_ROOT, 'output.txt'), "w") as file_out:
        # draw from each mapper's reservoir a number of samples proportional
        # to the fraction of the total input that mapper processed
        for count, sample_set in zip(counts_from_mappers, samples_from_mappers):
            weight = count * 1.0 / total_counts_from_mappers
            number_of_needed_samples = int(
                round(weight * self.options.sample_size))
            for _ in range(number_of_needed_samples):
                file_out.write(str(sample_set.pop()) + '\n')

    # never executed: keeps this function a generator, as the framework expects
    if False:
        yield 1, 2
def test_mr(self):
    """MRJob.mr() returns the step-dict format we currently expect."""
    def mapper(k, v):
        pass

    def mapper_init():
        pass

    def mapper_final():
        pass

    def reducer(k, vs):
        pass

    def reducer_init():
        pass

    def reducer_final():
        pass

    # make sure it returns the format we currently expect
    self.assertEqual(MRJob.mr(mapper, reducer), stepdict(mapper, reducer))

    extras = dict(mapper_init=mapper_init, mapper_final=mapper_final,
                  reducer_init=reducer_init, reducer_final=reducer_final)
    self.assertEqual(MRJob.mr(mapper, reducer, **extras),
                     stepdict(mapper, reducer, **extras))

    self.assertEqual(MRJob.mr(mapper), stepdict(mapper))
def test_default_protocol(self):
    """Output split at arbitrary chunk boundaries still parses as JSON pairs."""
    job = MRJob()

    chunks = [b'1\t2', b'\n{"3": ', b'4}\t"fi', b've"\n']
    parsed = list(job.parse_output(iter(chunks)))

    self.assertEqual(parsed, [(1, 2), ({'3': 4}, 'five')])
def test_wrong_type_of_step(self):
    """Streaming entry points reject a job whose step is a spark step."""
    mr_job = MRJob()
    mr_job.spark = MagicMock()

    for run_method in (mr_job.run_mapper,
                       mr_job.run_combiner,
                       mr_job.run_reducer):
        self.assertRaises(TypeError, run_method)
def test_default_options(self):
    """Default logging setup prints INFO but suppresses DEBUG."""
    with no_handlers_for_logger('__main__'):
        with patch.object(sys, 'stderr', StringIO()) as stderr:
            MRJob.set_up_logging()
            logger = logging.getLogger('__main__')
            logger.info('INFO')
            logger.debug('DEBUG')
            self.assertEqual(stderr.getvalue(), 'INFO\n')
def main(args=None):
    """Parse options, set up logging, and run the requested EMR actions."""
    option_parser = make_option_parser()
    options = parse_args(option_parser, args)

    MRJob.set_up_logging(quiet=options.quiet, verbose=options.verbose)

    # run the runner as a context manager so its resources are cleaned up
    with EMRJobRunner(**runner_kwargs(options)) as runner:
        perform_actions(options, runner)
def test_spark_and_spark_args_methods(self):
    """spark_args() output is attached to the generated SparkStep."""
    job = MRJob(["--no-conf"])
    job.spark = MagicMock()
    job.spark_args = MagicMock(return_value=["argh", "ARRRRGH!"])

    expected_args = ["argh", "ARRRRGH!"]
    self.assertEqual(
        job.steps(), [SparkStep(job.spark, spark_args=expected_args)])
    self.assertEqual(
        job._steps_desc(), [dict(type="spark", spark_args=expected_args)])
def test_no_mapper(self):
    """Omitting the mapper substitutes the identity mapper."""
    def mapper_final(k, v):
        pass

    def reducer(k, vs):
        pass

    assert_equal(MRJob.mr(), (_IDENTITY_MAPPER, None))

    assert_equal(MRJob.mr(reducer=reducer),
                 (_IDENTITY_MAPPER, reducer))

    assert_equal(MRJob.mr(reducer=reducer, mapper_final=mapper_final),
                 ((_IDENTITY_MAPPER, mapper_final), reducer))
def __init__(self, args):
    """Build the job's argument list from stdin lines, then init MRJob.

    Args:
        args: ignored; the original also discarded this and rebuilt the
            argument list from stdin.

    NOTE(review): the original version ended with a ``yield``, which made
    __init__ a generator — so none of this code (including
    ``MRJob.__init__``) ever actually ran. It also appended every interior
    line twice (lines[i] and lines[i+1] on each iteration). Both defects
    are fixed here; confirm the de-duplication matches the intended input
    format.
    """
    # strip and escape embedded double quotes so each stdin line can be
    # passed safely as a command-line argument
    stdin_lines = [line.strip() for line in sys.stdin]
    args = [line.replace('"', '\\"').strip('\n') for line in stdin_lines]
    MRJob.__init__(self, args)
def test_default(self):
    """parse_output() decodes JSON pairs and leaves stdout untouched."""
    # test parsing JSON
    mr_job = MRJob()
    output = '0\t1\n"a"\t"b"\n'
    mr_job.stdout = StringIO(output)

    self.assertEqual(mr_job.parse_output(), [(0, 1), ('a', 'b')])

    # verify that stdout is not cleared
    self.assertEqual(mr_job.stdout.getvalue(), output)
def test_libjars_attr_relative_path(self):
    """Relative LIBJARS entries resolve against the job script's directory."""
    script_dir = os.path.dirname(MRJob.mr_job_script())

    with patch.object(MRJob, 'LIBJARS', ['cookie.jar', '/left/dora.jar']):
        job = MRJob()
        expected = [os.path.join(script_dir, 'cookie.jar'), '/left/dora.jar']
        self.assertEqual(job._runner_kwargs()['libjars'], expected)
def test_deprecation_warning(self):
    """parse_output_line() logs its deprecation warning exactly once."""
    job = MRJob()

    job.parse_output_line(b'1\t2\n')
    self.assertEqual(self.log.warning.call_count, 1)

    # only warn once
    job.parse_output_line(b'3\t4\n')
    self.assertEqual(self.log.warning.call_count, 1)
def test_cmd_line_options(self):
    """When --partitioner appears twice, the later value is kept."""
    args = ['--partitioner', 'java.lang.Object',
            '--partitioner', 'org.apache.hadoop.mapreduce.Partitioner']
    mr_job = MRJob(args)

    # second option takes priority
    self.assertEqual(mr_job.job_runner_kwargs()['partitioner'],
                     'org.apache.hadoop.mapreduce.Partitioner')
def make_option_parser():
    """Build the OptionParser for the EMR log fetching/parsing tool.

    Returns:
        an optparse.OptionParser with listing, catting, filtering, and EMR
        connection options, alphabetized.
    """
    usage = 'usage: %prog [options] JOB_FLOW_ID'
    description = (
        'List, display, and parse Hadoop logs associated with EMR job flows.'
        ' Useful for debugging failed jobs for which mrjob did not display a'
        ' useful error message or for inspecting jobs whose output has been'
        ' lost.')
    option_parser = OptionParser(usage=usage, description=description)
    add_basic_opts(option_parser)
    # -f: scan logs for the probable cause of failure
    option_parser.add_option('-f', '--find-failure', dest='find_failure',
                             action='store_true', default=False,
                             help=('Search the logs for information about why'
                                   ' the job failed'))
    # -l / -L: list relevant vs. all log files
    option_parser.add_option('-l', '--list', dest='list_relevant',
                             action="store_true", default=False,
                             help='List log files MRJob finds relevant')
    option_parser.add_option('-L', '--list-all', dest='list_all',
                             action="store_true", default=False,
                             help='List all log files')
    # -a / -A: cat relevant vs. all log files
    option_parser.add_option('-a', '--cat', dest='cat_relevant',
                             action="store_true", default=False,
                             help='Cat log files MRJob finds relevant')
    option_parser.add_option('-A', '--cat-all', dest='cat_all',
                             action="store_true", default=False,
                             help='Cat all log files to JOB_FLOW_ID/')
    # -s: restrict --list/--cat output to one step
    option_parser.add_option('-s', '--step-num', dest='step_num',
                             action='store', type='int', default=None,
                             help=('Limit results to a single step. To be used'
                                   ' with --list and --cat.'))
    option_parser.add_option('--counters', dest='get_counters',
                             action='store_true', default=False,
                             help='Show counters from the job flow')
    add_emr_connect_opts(option_parser)
    # pull a few runner options out of MRJob's own option groups into
    # this parser so they can be given on this tool's command line too
    scrape_options_into_new_groups(
        MRJob().all_option_groups(),
        {option_parser: ('ec2_key_pair_file',
                         's3_sync_wait_time',
                         'ssh_bin')})
    alphabetize_options(option_parser)
    return option_parser
def test_spark_and_streaming_dont_mix(self):
    """A job defining both mapper() and spark() can't build its steps."""
    job = MRJob(['--no-conf'])
    job.mapper = MagicMock()
    job.spark = MagicMock()

    self.assertRaises(ValueError, job.steps)