def test_multistart_monte_carlo_expected_improvement_optimization(self):
    """Check that multistart optimization (gradient descent) can find the optimum point to sample (using 2-EI)."""
    numpy.random.seed(7858)  # TODO(271): Monte Carlo only works for this seed
    index = numpy.argmax(numpy.greater_equal(self.num_sampled_list, 20))
    domain, gaussian_process = self.gp_test_environments[index]

    max_num_steps = 75  # this is *too few* steps; we configure it this way so the test will run quickly
    max_num_restarts = 5
    num_steps_averaged = 50
    gamma = 0.2
    pre_mult = 1.5
    max_relative_change = 1.0
    tolerance = 3.0e-2  # really large tolerance b/c converging with monte-carlo (esp in Python) is expensive
    gd_parameters = GradientDescentParameters(
        max_num_steps,
        max_num_restarts,
        num_steps_averaged,
        gamma,
        pre_mult,
        max_relative_change,
        tolerance,
    )
    num_multistarts = 2

    # Expand the domain so that we are definitely not doing constrained optimization
    expanded_domain = TensorProductDomain([ClosedInterval(-4.0, 2.0)] * self.dim)
    num_to_sample = 2
    repeated_domain = RepeatedDomain(num_to_sample, expanded_domain)

    num_mc_iterations = 10000
    # Just any random point that won't be optimal
    points_to_sample = repeated_domain.generate_random_point_in_domain()
    ei_eval = ExpectedImprovement(gaussian_process, points_to_sample, num_mc_iterations=num_mc_iterations)
    # Compute EI and its gradient for the sake of comparison
    ei_initial = ei_eval.compute_expected_improvement(force_monte_carlo=True)  # TODO(271): Monte Carlo only works for this seed
    grad_ei_initial = ei_eval.compute_grad_expected_improvement()

    ei_optimizer = GradientDescentOptimizer(repeated_domain, ei_eval, gd_parameters)
    best_point = multistart_expected_improvement_optimization(ei_optimizer, num_multistarts, num_to_sample)

    # Check that gradients are "small"
    ei_eval.current_point = best_point
    ei_final = ei_eval.compute_expected_improvement(force_monte_carlo=True)  # TODO(271): Monte Carlo only works for this seed
    grad_ei_final = ei_eval.compute_grad_expected_improvement()
    self.assert_vector_within_relative(grad_ei_final, numpy.zeros(grad_ei_final.shape), tolerance)

    # Check that output is in the domain
    T.assert_equal(repeated_domain.check_point_inside(best_point), True)

    # Since we didn't really converge to the optimal EI (too costly), do some other sanity checks
    # EI should have improved
    T.assert_gt(ei_final, ei_initial)

    # grad EI should have improved (elementwise, in absolute value)
    for index in numpy.ndindex(grad_ei_final.shape):
        T.assert_lt(numpy.fabs(grad_ei_final[index]), numpy.fabs(grad_ei_initial[index]))
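# For context, a generic multistart pattern like the one the test above exercises.
# This is an illustrative sketch only, not MOE's multistart_expected_improvement_optimization;
# objective, optimize_from, and random_start are hypothetical callables standing in for
# the EI evaluator, a single gradient-descent run, and random domain sampling.
def _multistart_maximize(objective, optimize_from, random_start, num_multistarts):
    best_point, best_value = None, float('-inf')
    for _ in range(num_multistarts):
        candidate = optimize_from(random_start())  # local ascent from one random start
        value = objective(candidate)
        if value > best_value:  # keep the best point seen across all restarts
            best_point, best_value = candidate, value
    return best_point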
def test_big_fork(self):
    """Tests that we can fork a large number of processes, each of which will
    wait for a few milliseconds, and return.

    NOTE: currently fails if you bump 70 up to 200. We're going to fix this very soon.
    """
    time_sleep_s = 0.2
    test_time = self.run_big_fork_test(time_sleep_s, 70, 70, 3)
    print("Big fork performance test: {0:.2f} s (nominal: {1:.2f} s)".format(
        test_time, time_sleep_s))
    T.assert_lt(test_time, time_sleep_s * 2)
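# Illustrative sketch of the fork-and-wait pattern the test above measures; this is
# not the actual run_big_fork_test implementation (whose extra arguments are not
# modeled here). It forks num_children processes that each sleep for sleep_s
# seconds, reaps them all, and returns the elapsed wall-clock time. POSIX-only.
import os
import time

def _fork_sleep_and_time(num_children, sleep_s):
    start = time.time()
    pids = []
    for _ in range(num_children):
        pid = os.fork()
        if pid == 0:  # child: sleep briefly, then exit without running cleanup handlers
            time.sleep(sleep_s)
            os._exit(0)
        pids.append(pid)  # parent: remember the child pid
    for pid in pids:  # reap every child before taking the elapsed time
        os.waitpid(pid, 0)
    return time.time() - start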
def _assert_range(self, x, lower, upper):
    assert_gt(x, lower)
    assert_lt(x, upper)
def test__cmp__(self):
    other_node = node.Node('mocalhost', 'mocal', self.ssh_options)
    assert_lt(self.node, 'thename')
    assert_lt(self.node, other_node)
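# Illustrative sketch only: a name-based ordering of the kind the comparisons above
# rely on. The real tron node.Node defines its own comparison methods (the test name
# suggests a Python 2 __cmp__); _NameOrdered is a hypothetical stand-in that shows
# the same idea with functools.total_ordering, falling back to a bare string when
# the other operand has no .name attribute.
import functools

@functools.total_ordering
class _NameOrdered(object):
    def __init__(self, name):
        self.name = name

    def __eq__(self, other):
        return self.name == getattr(other, 'name', other)

    def __lt__(self, other):
        # Compare against another node's name, or against a plain string.
        return self.name < getattr(other, 'name', other)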
def _test_end_to_end(self, args=()):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO('foo\nbar\n')

    local_input_path = os.path.join(self.tmp_dir, 'input')
    with open(local_input_path, 'w') as local_input_file:
        local_input_file.write('bar\nqux\n')

    input_to_upload = os.path.join(self.tmp_dir, 'remote_input')
    with open(input_to_upload, 'w') as input_to_upload_file:
        input_to_upload_file.write('foo\n')
    remote_input_path = 'hdfs:///data/foo'
    check_call([self.hadoop_bin,
                'fs', '-put', input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([''])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(['-r', 'hadoop', '-v',
                           '--no-conf', '--hadoop-arg', '-libjar',
                           '--hadoop-arg', 'containsJars.jar'] + list(args)
                          + ['-', local_input_path, remote_input_path]
                          + ['--hadoop-input-format', 'FooFormat']
                          + ['--hadoop-output-format', 'BarFormat']
                          + ['--jobconf', 'x=y'])
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    # don't care that --hadoop-*-format is deprecated
    with logger_disabled('mrjob.job'):
        runner = mr_job.make_runner()

    with runner as runner:  # i.e. call cleanup when we're done
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ['MOCK_HDFS_ROOT']
        assert_equal(sorted(os.listdir(hdfs_root)), ['data', 'user'])
        home_dir = os.path.join(hdfs_root, 'user', getpass.getuser())
        assert_equal(os.listdir(home_dir), ['tmp'])
        assert_equal(os.listdir(os.path.join(home_dir, 'tmp')), ['mrjob'])
        assert_equal(runner._opts['hadoop_extra_args'],
                     ['-libjar', 'containsJars.jar'])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict['path'] == runner._mrjob_tar_gz_path]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict['name']

        pythonpath = runner._get_cmdenv()['PYTHONPATH']
        assert_in(mrjob_tar_gz_file_dict['name'], pythonpath.split(':'))

    assert_equal(sorted(results),
                 [(1, 'qux'), (2, 'bar'), (2, 'foo'), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ['MOCK_HADOOP_LOG']) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        jar_cmd_args = [args for args in hadoop_cmd_args
                        if args[:1] == ['jar']]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in('-inputformat', step_0_args)
        assert_not_in('-outputformat', step_0_args)
        assert_not_in('-inputformat', step_1_args)
        assert_in('-outputformat', step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in('-libjar', args)
            assert_in('-mapper', args)
            assert_lt(args.index('-libjar'), args.index('-mapper'))

        # make sure -jobconf made it through
        assert_in('-D', step_0_args)

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))
def test_end_to_end(self):
    # read from STDIN, a local file, and a remote file
    stdin = StringIO("foo\nbar\n")

    local_input_path = os.path.join(self.tmp_dir, "input")
    with open(local_input_path, "w") as local_input_file:
        local_input_file.write("bar\nqux\n")

    input_to_upload = os.path.join(self.tmp_dir, "remote_input")
    with open(input_to_upload, "w") as input_to_upload_file:
        input_to_upload_file.write("foo\n")
    remote_input_path = "hdfs:///data/foo"
    check_call([self.hadoop_bin, "fs", "-put", input_to_upload, remote_input_path])

    # doesn't matter what the intermediate output is; just has to exist.
    add_mock_hadoop_output([""])
    add_mock_hadoop_output(['1\t"qux"\n2\t"bar"\n', '2\t"foo"\n5\tnull\n'])

    mr_job = MRTwoStepJob(
        [
            "-r", "hadoop", "-v", "--no-conf",
            "--hadoop-arg", "-libjar",
            "--hadoop-arg", "containsJars.jar",
            "-", local_input_path, remote_input_path,
            "--hadoop-input-format", "FooFormat",
            "--hadoop-output-format", "BarFormat",
        ]
    )
    mr_job.sandbox(stdin=stdin)

    local_tmp_dir = None
    results = []

    with mr_job.make_runner() as runner:
        assert isinstance(runner, HadoopJobRunner)
        runner.run()

        for line in runner.stream_output():
            key, value = mr_job.parse_output_line(line)
            results.append((key, value))

        local_tmp_dir = runner._get_local_tmp_dir()
        # make sure cleanup hasn't happened yet
        assert os.path.exists(local_tmp_dir)
        assert any(runner.ls(runner.get_output_dir()))

        # make sure we're writing to the correct path in HDFS
        hdfs_root = os.environ["MOCK_HDFS_ROOT"]
        assert_equal(sorted(os.listdir(hdfs_root)), ["data", "user"])
        home_dir = os.path.join(hdfs_root, "user", getpass.getuser())
        assert_equal(os.listdir(home_dir), ["tmp"])
        assert_equal(os.listdir(os.path.join(home_dir, "tmp")), ["mrjob"])
        assert_equal(runner._opts["hadoop_extra_args"], ["-libjar", "containsJars.jar"])

        # make sure mrjob.tar.gz is uploaded and in PYTHONPATH
        assert runner._mrjob_tar_gz_path
        mrjob_tar_gz_file_dicts = [
            file_dict for file_dict in runner._files
            if file_dict["path"] == runner._mrjob_tar_gz_path
        ]
        assert_equal(len(mrjob_tar_gz_file_dicts), 1)

        mrjob_tar_gz_file_dict = mrjob_tar_gz_file_dicts[0]
        assert mrjob_tar_gz_file_dict["name"]

        pythonpath = runner._get_cmdenv()["PYTHONPATH"]
        assert_in(mrjob_tar_gz_file_dict["name"], pythonpath.split(":"))

    assert_equal(sorted(results), [(1, "qux"), (2, "bar"), (2, "foo"), (5, None)])

    # make sure we called hadoop the way we expected
    with open(os.environ["MOCK_HADOOP_LOG"]) as mock_log:
        hadoop_cmd_args = [shlex.split(line) for line in mock_log]

        jar_cmd_args = [args for args in hadoop_cmd_args if args[:1] == ["jar"]]
        assert_equal(len(jar_cmd_args), 2)
        step_0_args, step_1_args = jar_cmd_args

        # check input/output format
        assert_in("-inputformat", step_0_args)
        assert_not_in("-outputformat", step_0_args)
        assert_not_in("-inputformat", step_1_args)
        assert_in("-outputformat", step_1_args)

        # make sure -libjar extra arg comes before -mapper
        for args in (step_0_args, step_1_args):
            assert_in("-libjar", args)
            assert_in("-mapper", args)
            assert_lt(args.index("-libjar"), args.index("-mapper"))

    # make sure cleanup happens
    assert not os.path.exists(local_tmp_dir)
    assert not any(runner.ls(runner.get_output_dir()))