def main(argv=None):
    """Parse CLI options, configure the pterasort job and submit it."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    # Fixed job identity: always run the pterasort module.
    args.job_name = 'pterasort'
    args.module = 'pterasort'
    # Both record reader and writer are implemented on the Python side.
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    # Sample the input to compute per-reducer break points, then ship the
    # resulting file to the distributed cache together with the code.
    bp_filename = Partitioner.initialize_break_points(
        args.num_reducers, args.sampled_records, args.input, args.num_threads
    )
    args.upload_file_to_cache = ['pterasort.py', 'ioformats.py', bp_filename]
    submitter = PydoopSubmitter()
    submitter.set_args(args, leftover if leftover is not None else [])
    submitter.run()
def main(argv=None):
    """Parse CLI options, configure the pteragen job and submit it."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    args.job_name = 'pteragen'
    args.module = 'pteragen'
    args.upload_file_to_cache = ['pteragen.py', 'ioformats.py']
    # Row generation is driven by the Java-side range input format.
    args.input_format = 'it.crs4.pydoop.examples.pterasort.RangeInputFormat'
    args.do_not_use_java_record_writer = True
    # args.libjars = ['pydoop-input-formats.jar']
    add_D_arg(args, 'num_records', NUM_ROWS_KEY)
    add_D_arg(args, 'num_maps', NUM_MAPS_KEY)
    # Map-only job: mappers write the generated records directly.
    args.num_reducers = 0
    submitter = PydoopSubmitter()
    submitter.set_args(args, leftover if leftover is not None else [])
    submitter.run()
def main(argv=None):
    """Submit the pteracheck job, then validate its single output file."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    # A single reducer so that all check results land in one part file.
    args.num_reducers = 1
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, leftover if leftover is not None else [])
    submitter.run()
    # Read back the lone reducer output and verify the rows; the trailing
    # element after the split is the empty string past the final newline.
    path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(path, 'rb') as f:
        data = f.read()
    rows = data.split(b'\n')
    check_rows(rows[:-1])
def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
    """Run the dump job over ``input_dirs`` as a map-only MapReduce job.

    Packages the local ``PACKAGE`` tree as a zip for the workers, writes
    opaque input splits to HDFS, submits the job, then removes the splits
    file.  If ``collate`` is true, the job output is collated afterwards.

    Fixes vs. previous version: ``LOGGER.warn`` (deprecated alias) replaced
    with ``LOGGER.warning`` using lazy %-args, and the temporary working
    directory is now removed even when submission fails (try/finally).
    """
    wd = tempfile.mkdtemp(prefix="pydeep_")
    try:
        zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
        shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
        # One split per input dir at most; shrink nmaps if input is scarce.
        if nmaps > len(input_dirs):
            nmaps = len(input_dirs)
            LOGGER.warning("Not enough input dirs, will only do %d splits", nmaps)
        splits = common.balanced_split(input_dirs, nmaps)
        splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
        with hdfs.open(splits_uri, 'wb') as f:
            write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
        submitter = PydoopSubmitter()
        properties = {
            common.GRAPH_ARCH_KEY: model.name,
            common.LOG_LEVEL_KEY: log_level,
            common.NUM_MAPS_KEY: nmaps,
            common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
        }
        submitter.set_args(
            argparse.Namespace(
                D=list(properties.items()),
                avro_input=None,
                avro_output=None,
                cache_archive=None,
                cache_file=None,
                disable_property_name_conversion=True,
                do_not_use_java_record_reader=True,
                do_not_use_java_record_writer=True,
                entry_point="__main__",
                hadoop_conf=None,
                input=input_dirs[0],  # does it matter?
                input_format=None,
                job_conf=None,
                job_name="dump_weights",
                keep_wd=False,
                libjars=None,
                log_level=log_level,
                module=os.path.splitext(os.path.basename(__file__))[0],
                no_override_env=False,
                no_override_home=False,
                no_override_ld_path=False,
                no_override_path=False,
                no_override_pypath=False,
                num_reducers=0,
                output=output_dir,
                output_format=None,
                pretend=False,
                pstats_dir=None,
                python_program=sys.executable,
                python_zip=[zip_fn],
                set_env=None,
                upload_archive_to_cache=None,
                upload_file_to_cache=[__file__],
            ))
        submitter.run()
        hdfs.rmr(splits_uri)
        if collate:
            collate_mapred_output(output_dir)
    finally:
        # Always clean up the local scratch dir, even on failure.
        shutil.rmtree(wd)
def main(argv=None): os.chdir(os.path.dirname(os.path.abspath(__file__))) wd = tempfile.mkdtemp(prefix="pydeep_") zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE)) shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE) parser = make_parser() args, unknown_args = parser.parse_known_args(argv) args.job_name = WORKER args.module = WORKER args.upload_file_to_cache = ['%s.py' % WORKER] args.python_zip = [zip_fn] args.do_not_use_java_record_reader = True args.num_reducers = 0 if args.seed: LOGGER.info("setting random seed to %d", args.seed) random.seed(args.seed) model = models.get_model_info(args.architecture) graph = model.load_prep() bneck_tensor = model.get_bottleneck(graph) bneck_store = ioformats.BottleneckStore( bneck_tensor.shape[1].value, bneck_tensor.dtype ) bneck_map = bneck_store.build_map(args.input) LOGGER.info("%d subdirs, %r bottlenecks" % (len(bneck_map), [len(_) for _ in bneck_map.values()])) splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex) generate_input_splits(args.num_maps, bneck_map, splits_path) submitter = PydoopSubmitter() submitter.set_args(args, [] if unknown_args is None else unknown_args) submitter.properties.update({ common.BNECKS_DIR_KEY: args.input, common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval, common.GRAPH_ARCH_KEY: args.architecture, common.LEARNING_RATE_KEY: args.learning_rate, common.LOG_LEVEL_KEY: args.log_level, common.NUM_MAPS_KEY: args.num_maps, common.NUM_STEPS_KEY: args.num_steps, common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path, common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size, common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size, common.VALIDATION_PERCENT_KEY: args.validation_percent, }) if args.seed: submitter.properties[common.SEED_KEY] = args.seed submitter.run() hdfs.rmr(splits_path) shutil.rmtree(wd)
def main(argv=None):
    """Configure and submit the feature-extraction worker job."""
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    # Job identity and worker payload.
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    # Map-only job with a Python-side record reader.
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0
    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)
    # Partition the input images evenly across the requested mappers and
    # persist the resulting opaque splits next to the input data.
    image_paths = list_images(args.input)
    split_list = common.balanced_split(image_paths, args.num_maps)
    splits_uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", splits_uri)
    with hdfs.open(splits_uri, 'wb') as fout:
        write_opaques([OpaqueInputSplit(1, _) for _ in split_list], fout)
    submitter = PydoopSubmitter()
    submitter.set_args(args, leftover if leftover is not None else [])
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    })
    submitter.run()
    hdfs.rmr(splits_uri)
    shutil.rmtree(wd)
def setUp(self):
    # Give each test a pristine submitter instance.
    self.submitter = PydoopSubmitter()
class TestAppSubmit(unittest.TestCase):
    """Tests for the pydoop submit app: argument parsing and pipes-code env."""

    def setUp(self):
        self.submitter = PydoopSubmitter()

    @staticmethod
    def _gen_default_args():
        # Minimal argument set accepted by PydoopSubmitter.set_args.
        return Args(
            entry_point='__main__',
            log_level='INFO',
            module='the_module',
            no_override_env=False,
            no_override_home=False,
            python_program='python',
            output="output_path",
            job_name="job_name",
            num_reducers=0,
        )

    def test_help(self):
        parser = app.make_parser()
        # silence!
        for k in ['submit', 'script']:
            parser._actions[2].choices[k].format_help = nop
            parser._actions[2].choices[k].format_usage = nop
            parser._actions[2].choices[k].error = nop
        parser.format_help = nop
        parser.format_usage = nop
        parser.error = nop
        # FIX: SystemExit carries the exit status in ``.code`` (``.message``
        # does not exist on Python 3 and raised AttributeError here).
        try:
            args, unk = parser.parse_known_args(['-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit', '-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit'])
        except SystemExit as e:
            self.assertEqual(e.code, 2)

    def _check_args(self, args, args_kv):
        # Verify each CLI option landed on the parsed namespace.
        for k, v in args_kv:
            k = re.sub("^--", "", k).replace('-', '_')
            self.assertTrue(hasattr(args, k))
            v1 = getattr(args, k)
            if v is None:
                # Flag options parse to True.
                self.assertEqual(v1, True)
            elif type(v1) is list:
                pass
            else:
                self.assertEqual(v1, v)

    def test_conf_file(self):
        wd = tempfile.mkdtemp(prefix='pydoop_')
        conf_file = os.path.join(wd, 'pydoop.conf')
        args_kv = (("--pretend", None),
                   ("--mrv2", None),
                   ("--input-format", 'mapreduce.lib.input.TextInputFormat'),
                   ("--output-format", 'mapreduce.lib.input.TextOutputFormat'),
                   ("--num-reducers", 10),
                   ("--python-zip", 'allmymodules.zip'),
                   )
        try:
            with open(conf_file, 'w') as cf:
                d = ''.join(['{}\n{}\n'.format(k, v)
                             if v is not None else '{}\n'.format(k)
                             for (k, v) in args_kv])
                cf.write(d)
            parser = app.make_parser()
            parser.format_help = nop
            module = 'mymod1.mod2.mod3'
            ainput = 'input'
            aoutput = 'output'
            argv = ['submit', module, ainput, aoutput, '@' + conf_file]
            [args, unknown] = parser.parse_known_args(argv)
            self.assertEqual(args.module, module)
            self.assertEqual(args.input, ainput)
            self.assertEqual(args.output, aoutput)
            self.assertEqual(len(unknown), 0)
            self._check_args(args, args_kv)
        finally:
            shutil.rmtree(wd)

    def test_empty_param(self):
        parser = app.make_parser()
        parser.format_help = nop
        program = 'program'
        ainput = 'input'
        aoutput = 'output'
        argv = ['submit', '--module', '', program, ainput, aoutput]
        [args, unknown] = parser.parse_known_args(argv)
        self.assertEqual(args.module, '')

    def test_generate_pipes_code_env(self):
        args = self._gen_default_args()
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            # we set this variable for this test since it may not be set in
            # the environment
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH=' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_ld_path(self):
        args = self._gen_default_args()
        args.no_override_ld_path = True
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertFalse('export LD_LIBRARY_PATH=' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_path(self):
        args = self._gen_default_args()
        args.no_override_path = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH=' in code)
        self.assertFalse('export PATH=' in code)

    def test_generate_pipes_code_no_override_pythonpath(self):
        args = self._gen_default_args()
        args.no_override_pypath = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH="${PWD}:${PYTHONPATH}"' in code)
        self.assertTrue('export PATH=' in code)

    def test_generate_pipes_code_with_set_env(self):
        args = self._gen_default_args()
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH="/my/custom/path"' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_code_no_env_override(self):
        args = self._gen_default_args()
        args.no_override_env = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertFalse('export PATH=' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_generate_code_no_env_override_with_set_env(self):
        args = self._gen_default_args()
        args.no_override_env = True
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PATH="/my/custom/path"' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_env_arg_to_dict(self):
        env_arg = [
            'var1=value1',
            ' var2 = value2 ',
            'var3 = str with = sign'
        ]
        d = self.submitter._env_arg_to_dict(env_arg)
        # FIX: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual('value1', d['var1'])
        self.assertEqual('value2', d['var2'])
        self.assertEqual('str with = sign', d['var3'])
# NOTE(review): this redefines TestAppSubmit, shadowing the earlier class of
# the same name at module level — if both copies are in one module, only this
# one runs; confirm whether the duplicate is intentional.
class TestAppSubmit(unittest.TestCase):
    """Tests for the pydoop submit app: argument parsing and pipes-code env."""

    def setUp(self):
        self.submitter = PydoopSubmitter()

    @staticmethod
    def _gen_default_args():
        # Minimal argument set accepted by PydoopSubmitter.set_args.
        return Args(
            entry_point='__main__',
            log_level='INFO',
            module='the_module',
            no_override_env=False,
            no_override_home=False,
            python_program='python',
            output="output_path",
            job_name="job_name",
            num_reducers=0,
        )

    def test_help(self):
        parser = app.make_parser()
        # silence!
        for k in ['submit', 'script']:
            parser._actions[2].choices[k].format_help = nop
            parser._actions[2].choices[k].format_usage = nop
            parser._actions[2].choices[k].error = nop
        parser.format_help = nop
        parser.format_usage = nop
        parser.error = nop
        # FIX: SystemExit carries the exit status in ``.code`` (``.message``
        # does not exist on Python 3 and raised AttributeError here).
        try:
            args, unk = parser.parse_known_args(['-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit', '-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit'])
        except SystemExit as e:
            self.assertEqual(e.code, 2)

    def _check_args(self, args, args_kv):
        # Verify each CLI option landed on the parsed namespace.
        for k, v in args_kv:
            k = re.sub("^--", "", k).replace('-', '_')
            self.assertTrue(hasattr(args, k))
            v1 = getattr(args, k)
            if v is None:
                # Flag options parse to True.
                self.assertEqual(v1, True)
            elif type(v1) is list:
                pass
            else:
                self.assertEqual(v1, v)

    def test_conf_file(self):
        wd = tempfile.mkdtemp(prefix='pydoop_')
        conf_file = os.path.join(wd, 'pydoop.conf')
        args_kv = (
            ("--pretend", None),
            ("--mrv2", None),
            ("--input-format", 'mapreduce.lib.input.TextInputFormat'),
            ("--output-format", 'mapreduce.lib.input.TextOutputFormat'),
            ("--num-reducers", 10),
            ("--python-zip", 'allmymodules.zip'),
        )
        try:
            with open(conf_file, 'w') as cf:
                d = ''.join([
                    '{}\n{}\n'.format(k, v)
                    if v is not None else '{}\n'.format(k)
                    for (k, v) in args_kv
                ])
                cf.write(d)
            parser = app.make_parser()
            parser.format_help = nop
            module = 'mymod1.mod2.mod3'
            ainput = 'input'
            aoutput = 'output'
            argv = ['submit', module, ainput, aoutput, '@' + conf_file]
            [args, unknown] = parser.parse_known_args(argv)
            self.assertEqual(args.module, module)
            self.assertEqual(args.input, ainput)
            self.assertEqual(args.output, aoutput)
            self.assertEqual(len(unknown), 0)
            self._check_args(args, args_kv)
        finally:
            shutil.rmtree(wd)

    def test_empty_param(self):
        parser = app.make_parser()
        parser.format_help = nop
        program = 'program'
        ainput = 'input'
        aoutput = 'output'
        argv = ['submit', '--module', '', program, ainput, aoutput]
        [args, unknown] = parser.parse_known_args(argv)
        self.assertEqual(args.module, '')

    def test_generate_pipes_code_env(self):
        args = self._gen_default_args()
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            # we set this variable for this test since it may not be set in
            # the environment
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH=' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_ld_path(self):
        args = self._gen_default_args()
        args.no_override_ld_path = True
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertFalse('export LD_LIBRARY_PATH=' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_path(self):
        args = self._gen_default_args()
        args.no_override_path = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH=' in code)
        self.assertFalse('export PATH=' in code)

    def test_generate_pipes_code_no_override_pythonpath(self):
        args = self._gen_default_args()
        args.no_override_pypath = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH="${PWD}:${PYTHONPATH}"' in code)
        self.assertTrue('export PATH=' in code)

    def test_generate_pipes_code_with_set_env(self):
        args = self._gen_default_args()
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')
        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH="/my/custom/path"' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_code_no_env_override(self):
        args = self._gen_default_args()
        args.no_override_env = True
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertFalse('export PATH=' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_generate_code_no_env_override_with_set_env(self):
        args = self._gen_default_args()
        args.no_override_env = True
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PATH="/my/custom/path"' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_env_arg_to_dict(self):
        env_arg = ['var1=value1', ' var2 = value2 ', 'var3 = str with = sign']
        d = self.submitter._env_arg_to_dict(env_arg)
        # FIX: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual('value1', d['var1'])
        self.assertEqual('value2', d['var2'])
        self.assertEqual('str with = sign', d['var3'])