Ejemplo n.º 1
0
def main(argv=None):
    """Configure and launch the pterasort MapReduce job."""
    args, extra_args = make_parser().parse_known_args(argv)
    args.job_name = 'pterasort'
    args.module = 'pterasort'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    # Sample the input to compute per-reducer break points for partitioning.
    bp_filename = Partitioner.initialize_break_points(
        args.num_reducers, args.sampled_records, args.input, args.num_threads
    )
    args.upload_file_to_cache = ['pterasort.py', 'ioformats.py', bp_filename]
    submitter = PydoopSubmitter()
    submitter.set_args(args, extra_args if extra_args is not None else [])
    submitter.run()
Ejemplo n.º 2
0
def main(argv=None):
    """Configure and launch the pteragen MapReduce job."""
    args, extra_args = make_parser().parse_known_args(argv)
    args.job_name = 'pteragen'
    args.module = 'pteragen'
    args.upload_file_to_cache = ['pteragen.py', 'ioformats.py']
    args.input_format = 'it.crs4.pydoop.examples.pterasort.RangeInputFormat'
    args.do_not_use_java_record_writer = True
    # args.libjars = ['pydoop-input-formats.jar']
    add_D_arg(args, 'num_records', NUM_ROWS_KEY)
    add_D_arg(args, 'num_maps', NUM_MAPS_KEY)
    args.num_reducers = 0  # map-only job
    submitter = PydoopSubmitter()
    submitter.set_args(args, extra_args if extra_args is not None else [])
    submitter.run()
Ejemplo n.º 3
0
def main(argv=None):
    """Run the pteracheck job, then verify the sorted output rows."""
    args, extra_args = make_parser().parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1  # single reducer -> single output part file
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, extra_args if extra_args is not None else [])
    submitter.run()
    # Read the lone reduce output back from HDFS and check the rows.
    part_path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(part_path, 'rb') as f:
        contents = f.read()
    check_rows(contents.split(b'\n')[:-1])
Ejemplo n.º 4
0
def run_mapred(model, input_dirs, output_dir, nmaps, log_level, collate=False):
    """Submit a pydoop MapReduce job that runs this module over *input_dirs*.

    Packages PACKAGE into a zip for the workers, writes one opaque input
    split per map task to HDFS, submits the job, then removes the splits
    file.  If *collate* is true, the map output is collated afterwards.

    :param model: model info object; only its ``name`` is used here
    :param input_dirs: input directories, balanced across map tasks
    :param output_dir: job output directory
    :param nmaps: requested number of map tasks (capped at len(input_dirs))
    :param log_level: log level propagated to the workers
    :param collate: if True, collate the MapReduce output afterwards
    """
    wd = tempfile.mkdtemp(prefix="pydeep_")
    try:
        zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
        shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)
        if nmaps > len(input_dirs):
            nmaps = len(input_dirs)
            # Logger.warn is a deprecated alias; use warning() with lazy
            # %-style arguments instead of eager string formatting.
            LOGGER.warning(
                "Not enough input dirs, will only do %d splits", nmaps
            )
        splits = common.balanced_split(input_dirs, nmaps)
        splits_uri = "pydoop_splits_%s" % uuid.uuid4().hex
        with hdfs.open(splits_uri, 'wb') as f:
            write_opaques([OpaqueInputSplit(1, _) for _ in splits], f)
        submitter = PydoopSubmitter()
        properties = {
            common.GRAPH_ARCH_KEY: model.name,
            common.LOG_LEVEL_KEY: log_level,
            common.NUM_MAPS_KEY: nmaps,
            common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
        }
        submitter.set_args(
            argparse.Namespace(
                D=list(properties.items()),
                avro_input=None,
                avro_output=None,
                cache_archive=None,
                cache_file=None,
                disable_property_name_conversion=True,
                do_not_use_java_record_reader=True,
                do_not_use_java_record_writer=True,
                entry_point="__main__",
                hadoop_conf=None,
                input=input_dirs[0],  # does it matter?
                input_format=None,
                job_conf=None,
                job_name="dump_weights",
                keep_wd=False,
                libjars=None,
                log_level=log_level,
                module=os.path.splitext(os.path.basename(__file__))[0],
                no_override_env=False,
                no_override_home=False,
                no_override_ld_path=False,
                no_override_path=False,
                no_override_pypath=False,
                num_reducers=0,
                output=output_dir,
                output_format=None,
                pretend=False,
                pstats_dir=None,
                python_program=sys.executable,
                python_zip=[zip_fn],
                set_env=None,
                upload_archive_to_cache=None,
                upload_file_to_cache=[__file__],
            ))
        submitter.run()
        hdfs.rmr(splits_uri)
        if collate:
            collate_mapred_output(output_dir)
    finally:
        # Always remove the local scratch dir, even if submission fails
        # (the original leaked it on any exception above).
        shutil.rmtree(wd)
Ejemplo n.º 5
0
def main(argv=None):
    """Parse CLI args and submit the pydeep retraining MapReduce job.

    Packages PACKAGE into a zip for the workers, builds per-map input
    splits from the bottleneck store under ``args.input``, forwards the
    training hyperparameters to the job as properties, runs the job, and
    finally removes the splits file and the local scratch directory.
    """

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    shutil.make_archive(*zip_fn.rsplit(".", 1), base_dir=PACKAGE)

    parser = make_parser()
    args, unknown_args = parser.parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.num_reducers = 0  # map-only job
    if args.seed:
        LOGGER.info("setting random seed to %d", args.seed)
        random.seed(args.seed)

    model = models.get_model_info(args.architecture)
    graph = model.load_prep()
    bneck_tensor = model.get_bottleneck(graph)
    # bneck_tensor.shape[1].value: presumably the bottleneck feature
    # width (second tensor dimension) — confirm against BottleneckStore.
    bneck_store = ioformats.BottleneckStore(
        bneck_tensor.shape[1].value, bneck_tensor.dtype
    )
    bneck_map = bneck_store.build_map(args.input)
    LOGGER.info("%d subdirs, %r bottlenecks" %
                (len(bneck_map), [len(_) for _ in bneck_map.values()]))
    # NOTE(review): the '_' prefix presumably keeps the splits file from
    # being treated as regular job input — confirm.
    splits_path = os.path.join(args.input, '_' + uuid.uuid4().hex)
    generate_input_splits(args.num_maps, bneck_map, splits_path)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if unknown_args is None else unknown_args)
    # Forward the training hyperparameters to the workers as job properties.
    submitter.properties.update({
        common.BNECKS_DIR_KEY: args.input,
        common.EVAL_STEP_INTERVAL_KEY: args.eval_step_interval,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.LEARNING_RATE_KEY: args.learning_rate,
        common.LOG_LEVEL_KEY: args.log_level,
        common.NUM_MAPS_KEY: args.num_maps,
        common.NUM_STEPS_KEY: args.num_steps,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_path,
        common.TRAIN_BATCH_SIZE_KEY: args.train_batch_size,
        common.VALIDATION_BATCH_SIZE_KEY: args.validation_batch_size,
        common.VALIDATION_PERCENT_KEY: args.validation_percent,
    })
    if args.seed:
        submitter.properties[common.SEED_KEY] = args.seed
    submitter.run()
    # Clean up the splits file (HDFS) and the local scratch dir.
    hdfs.rmr(splits_path)
    shutil.rmtree(wd)
Ejemplo n.º 6
0
def main(argv=None):
    """Set up and submit the pterasort job."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    args.job_name = 'pterasort'
    args.module = 'pterasort'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    # Compute reducer break points from a sample of the input records.
    bp_filename = Partitioner.initialize_break_points(
        args.num_reducers,
        args.sampled_records,
        args.input,
        args.num_threads,
    )
    args.upload_file_to_cache = ['pterasort.py', 'ioformats.py', bp_filename]
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if leftover is None else leftover)
    submitter.run()
Ejemplo n.º 7
0
def main(argv=None):
    """Set up and submit the pteragen job."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    args.job_name = 'pteragen'
    args.module = 'pteragen'
    args.upload_file_to_cache = ['pteragen.py', 'ioformats.py']
    args.input_format = 'it.crs4.pydoop.examples.pterasort.RangeInputFormat'
    args.do_not_use_java_record_writer = True
    # args.libjars = ['pydoop-input-formats.jar']
    add_D_arg(args, 'num_records', NUM_ROWS_KEY)
    add_D_arg(args, 'num_maps', NUM_MAPS_KEY)
    args.num_reducers = 0  # generation is map-only
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if leftover is None else leftover)
    submitter.run()
Ejemplo n.º 8
0
def main(argv=None):
    """Submit the pteracheck job and validate its single output file."""
    parser = make_parser()
    args, leftover = parser.parse_known_args(argv)
    args.job_name = 'pteracheck'
    args.module = 'pteracheck'
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = False
    args.num_reducers = 1  # one reducer -> one part file to inspect
    args.upload_file_to_cache = ['pteracheck.py', 'ioformats.py']
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if leftover is None else leftover)
    submitter.run()
    # Fetch the reduce output and verify row ordering.
    out_path = os.path.join(args.output, 'part-r-00000')
    with hdfs.open(out_path, 'rb') as f:
        payload = f.read()
    check_rows(payload.split(b'\n')[:-1])
Ejemplo n.º 9
0
def main(argv=None):
    """Parse CLI args and submit the map-only pydeep worker job."""

    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    wd = tempfile.mkdtemp(prefix="pydeep_")
    zip_fn = os.path.join(wd, "{}.zip".format(PACKAGE))
    archive_base, archive_fmt = zip_fn.rsplit(".", 1)
    shutil.make_archive(archive_base, archive_fmt, base_dir=PACKAGE)

    args, leftover = make_parser().parse_known_args(argv)
    args.job_name = WORKER
    args.module = WORKER
    args.upload_file_to_cache = ['%s.py' % WORKER]
    args.python_zip = [zip_fn]
    args.do_not_use_java_record_reader = True
    args.do_not_use_java_record_writer = True
    args.num_reducers = 0

    LOGGER.setLevel(args.log_level)
    model = get_model_info(args.architecture)
    get_graph(model, log_level=args.log_level)

    # Balance the input images across the requested number of map tasks.
    images = list_images(args.input)
    splits = common.balanced_split(images, args.num_maps)
    splits_uri = os.path.join(args.input, '_' + uuid.uuid4().hex)
    LOGGER.debug("saving input splits to: %s", splits_uri)
    with hdfs.open(splits_uri, 'wb') as f:
        write_opaques([OpaqueInputSplit(1, s) for s in splits], f)
    submitter = PydoopSubmitter()
    submitter.set_args(args, [] if leftover is None else leftover)
    submitter.properties.update({
        common.NUM_MAPS_KEY: args.num_maps,
        common.GRAPH_ARCH_KEY: args.architecture,
        common.PYDOOP_EXTERNALSPLITS_URI_KEY: splits_uri,
    })
    submitter.run()
    hdfs.rmr(splits_uri)
    shutil.rmtree(wd)
Ejemplo n.º 10
0
 def setUp(self):
     # Create a fresh submitter instance for each test.
     self.submitter = PydoopSubmitter()
Ejemplo n.º 11
0
class TestAppSubmit(unittest.TestCase):
    """Tests for the pydoop submit app: argument parsing and pipes code."""

    def setUp(self):
        # Fresh submitter instance for each test.
        self.submitter = PydoopSubmitter()

    @staticmethod
    def _gen_default_args():
        """Build a minimal valid argument set for set_args()."""
        return Args(
            entry_point='__main__',
            log_level='INFO',
            module='the_module',
            no_override_env=False,
            no_override_home=False,
            python_program='python',
            output="output_path",
            job_name="job_name",
            num_reducers=0,
        )

    def test_help(self):
        """-h/--help must exit with status 0; a bare 'submit' with 2."""
        parser = app.make_parser()
        # silence!
        for k in ['submit', 'script']:
            parser._actions[2].choices[k].format_help = nop
            parser._actions[2].choices[k].format_usage = nop
            parser._actions[2].choices[k].error = nop
        parser.format_help = nop
        parser.format_usage = nop
        parser.error = nop
        try:
            args, unk = parser.parse_known_args(['-h'])
        except SystemExit as e:
            # SystemExit stores the exit status in .code; the original
            # checked .message, which does not exist in Python 3.
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit', '-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit'])
        except SystemExit as e:
            self.assertEqual(e.code, 2)

    def _check_args(self, args, args_kv):
        """Assert that every (flag, value) pair was parsed onto *args*."""
        for k, v in args_kv:
            k = re.sub("^--", "", k).replace('-', '_')
            self.assertTrue(hasattr(args, k))
            v1 = getattr(args, k)
            if v is None:
                # valueless flags are store_true options
                self.assertEqual(v1, True)
            elif type(v1) is list:
                pass  # list-valued options: exact contents not checked here
            else:
                self.assertEqual(v1, v)

    def test_conf_file(self):
        """Options can be loaded from an @conf_file argument."""
        wd = tempfile.mkdtemp(prefix='pydoop_')
        conf_file = os.path.join(wd, 'pydoop.conf')
        args_kv = (("--pretend", None),
                   ("--mrv2", None),
                   ("--input-format", 'mapreduce.lib.input.TextInputFormat'),
                   ("--output-format", 'mapreduce.lib.input.TextOutputFormat'),
                   ("--num-reducers", 10),
                   ("--python-zip", 'allmymodules.zip'),
                   )
        try:
            with open(conf_file, 'w') as cf:
                d = ''.join(['{}\n{}\n'.format(k, v)
                             if v is not None else '{}\n'.format(k)
                             for (k, v) in args_kv])
                cf.write(d)
            parser = app.make_parser()
            parser.format_help = nop
            module = 'mymod1.mod2.mod3'
            ainput = 'input'
            aoutput = 'output'
            argv = ['submit', module, ainput, aoutput, '@' + conf_file]
            [args, unknown] = parser.parse_known_args(argv)
            self.assertEqual(args.module, module)
            self.assertEqual(args.input, ainput)
            self.assertEqual(args.output, aoutput)
            self.assertEqual(len(unknown), 0)
            self._check_args(args, args_kv)
        finally:
            shutil.rmtree(wd)

    def test_empty_param(self):
        """An explicitly empty --module value is preserved as ''."""
        parser = app.make_parser()
        parser.format_help = nop
        program = 'program'
        ainput = 'input'
        aoutput = 'output'
        argv = ['submit', '--module', '', program, ainput, aoutput]
        [args, unknown] = parser.parse_known_args(argv)
        self.assertEqual(args.module, '')

    def test_generate_pipes_code_env(self):
        """By default PATH, PYTHONPATH and LD_LIBRARY_PATH are exported."""
        args = self._gen_default_args()
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            # we set this variable for this test since it may not be set in
            # the environment
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH=' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_ld_path(self):
        """no_override_ld_path suppresses the LD_LIBRARY_PATH export."""
        args = self._gen_default_args()
        args.no_override_ld_path = True
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertFalse('export LD_LIBRARY_PATH=' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_path(self):
        """no_override_path suppresses the PATH export."""
        args = self._gen_default_args()
        args.no_override_path = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH=' in code)
        self.assertFalse('export PATH=' in code)

    def test_generate_pipes_code_no_override_pythonpath(self):
        """no_override_pypath appends to ${PYTHONPATH} instead of replacing."""
        args = self._gen_default_args()
        args.no_override_pypath = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH="${PWD}:${PYTHONPATH}"' in code)
        self.assertTrue('export PATH=' in code)

    def test_generate_pipes_code_with_set_env(self):
        """set_env entries override the corresponding inherited variables."""
        args = self._gen_default_args()
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH="/my/custom/path"' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_code_no_env_override(self):
        """no_override_env drops inherited env exports (except PYTHONPATH)."""
        args = self._gen_default_args()
        args.no_override_env = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertFalse('export PATH=' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_generate_code_no_env_override_with_set_env(self):
        """set_env entries are exported even when no_override_env is set."""
        args = self._gen_default_args()
        args.no_override_env = True
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()

        self.assertTrue('export PATH="/my/custom/path"' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_env_arg_to_dict(self):
        """_env_arg_to_dict strips whitespace, splitting on the first '='."""
        env_arg = ['var1=value1', ' var2 = value2 ', 'var3 = str with = sign']
        d = self.submitter._env_arg_to_dict(env_arg)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual('value1', d['var1'])
        self.assertEqual('value2', d['var2'])
        self.assertEqual('str with = sign', d['var3'])
Ejemplo n.º 12
0
 def setUp(self):
     # Create a fresh submitter instance for each test.
     self.submitter = PydoopSubmitter()
Ejemplo n.º 13
0
class TestAppSubmit(unittest.TestCase):
    """Tests for the pydoop submit app: argument parsing and pipes code."""

    def setUp(self):
        # Fresh submitter instance for each test.
        self.submitter = PydoopSubmitter()

    @staticmethod
    def _gen_default_args():
        """Build a minimal valid argument set for set_args()."""
        return Args(
            entry_point='__main__',
            log_level='INFO',
            module='the_module',
            no_override_env=False,
            no_override_home=False,
            python_program='python',
            output="output_path",
            job_name="job_name",
            num_reducers=0,
        )

    def test_help(self):
        """-h/--help must exit with status 0; a bare 'submit' with 2."""
        parser = app.make_parser()
        # silence!
        for k in ['submit', 'script']:
            parser._actions[2].choices[k].format_help = nop
            parser._actions[2].choices[k].format_usage = nop
            parser._actions[2].choices[k].error = nop
        parser.format_help = nop
        parser.format_usage = nop
        parser.error = nop
        try:
            args, unk = parser.parse_known_args(['-h'])
        except SystemExit as e:
            # SystemExit stores the exit status in .code; the original
            # checked .message, which does not exist in Python 3.
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit', '-h'])
        except SystemExit as e:
            self.assertEqual(e.code, 0)
        try:
            args, unk = parser.parse_known_args(['submit'])
        except SystemExit as e:
            self.assertEqual(e.code, 2)

    def _check_args(self, args, args_kv):
        """Assert that every (flag, value) pair was parsed onto *args*."""
        for k, v in args_kv:
            k = re.sub("^--", "", k).replace('-', '_')
            self.assertTrue(hasattr(args, k))
            v1 = getattr(args, k)
            if v is None:
                # valueless flags are store_true options
                self.assertEqual(v1, True)
            elif type(v1) is list:
                pass  # list-valued options: exact contents not checked here
            else:
                self.assertEqual(v1, v)

    def test_conf_file(self):
        """Options can be loaded from an @conf_file argument."""
        wd = tempfile.mkdtemp(prefix='pydoop_')
        conf_file = os.path.join(wd, 'pydoop.conf')
        args_kv = (
            ("--pretend", None),
            ("--mrv2", None),
            ("--input-format", 'mapreduce.lib.input.TextInputFormat'),
            ("--output-format", 'mapreduce.lib.input.TextOutputFormat'),
            ("--num-reducers", 10),
            ("--python-zip", 'allmymodules.zip'),
        )
        try:
            with open(conf_file, 'w') as cf:
                d = ''.join([
                    '{}\n{}\n'.format(k, v)
                    if v is not None else '{}\n'.format(k)
                    for (k, v) in args_kv
                ])
                cf.write(d)
            parser = app.make_parser()
            parser.format_help = nop
            module = 'mymod1.mod2.mod3'
            ainput = 'input'
            aoutput = 'output'
            argv = ['submit', module, ainput, aoutput, '@' + conf_file]
            [args, unknown] = parser.parse_known_args(argv)
            self.assertEqual(args.module, module)
            self.assertEqual(args.input, ainput)
            self.assertEqual(args.output, aoutput)
            self.assertEqual(len(unknown), 0)
            self._check_args(args, args_kv)
        finally:
            shutil.rmtree(wd)

    def test_empty_param(self):
        """An explicitly empty --module value is preserved as ''."""
        parser = app.make_parser()
        parser.format_help = nop
        program = 'program'
        ainput = 'input'
        aoutput = 'output'
        argv = ['submit', '--module', '', program, ainput, aoutput]
        [args, unknown] = parser.parse_known_args(argv)
        self.assertEqual(args.module, '')

    def test_generate_pipes_code_env(self):
        """By default PATH, PYTHONPATH and LD_LIBRARY_PATH are exported."""
        args = self._gen_default_args()
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            # we set this variable for this test since it may not be set in
            # the environment
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH=' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_ld_path(self):
        """no_override_ld_path suppresses the LD_LIBRARY_PATH export."""
        args = self._gen_default_args()
        args.no_override_ld_path = True
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertFalse('export LD_LIBRARY_PATH=' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_pipes_code_no_override_path(self):
        """no_override_path suppresses the PATH export."""
        args = self._gen_default_args()
        args.no_override_path = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH=' in code)
        self.assertFalse('export PATH=' in code)

    def test_generate_pipes_code_no_override_pythonpath(self):
        """no_override_pypath appends to ${PYTHONPATH} instead of replacing."""
        args = self._gen_default_args()
        args.no_override_pypath = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertTrue('export PYTHONPATH="${PWD}:${PYTHONPATH}"' in code)
        self.assertTrue('export PATH=' in code)

    def test_generate_pipes_code_with_set_env(self):
        """set_env entries override the corresponding inherited variables."""
        args = self._gen_default_args()
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)
        old_ld_lib_path = os.environ.get('LD_LIBRARY_PATH', '')

        try:
            os.environ['LD_LIBRARY_PATH'] = '/test_path'
            code = self.submitter._generate_pipes_code()
            self.assertTrue('export PATH="/my/custom/path"' in code)
            self.assertTrue('export PYTHONPATH=' in code)
            self.assertTrue('export LD_LIBRARY_PATH="/test_path"' in code)
        finally:
            os.environ['LD_LIBRARY_PATH'] = old_ld_lib_path

    def test_generate_code_no_env_override(self):
        """no_override_env drops inherited env exports (except PYTHONPATH)."""
        args = self._gen_default_args()
        args.no_override_env = True
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()
        self.assertFalse('export PATH=' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_generate_code_no_env_override_with_set_env(self):
        """set_env entries are exported even when no_override_env is set."""
        args = self._gen_default_args()
        args.no_override_env = True
        args.set_env = ["PATH=/my/custom/path"]
        self.submitter.set_args(args)

        code = self.submitter._generate_pipes_code()

        self.assertTrue('export PATH="/my/custom/path"' in code)
        self.assertFalse('export LD_LIBRARY_PATH="/test_path"' in code)
        # PYTHONPATH should still be there because we add the hadoop
        # working directory
        self.assertTrue('export PYTHONPATH=' in code)

    def test_env_arg_to_dict(self):
        """_env_arg_to_dict strips whitespace, splitting on the first '='."""
        env_arg = ['var1=value1', ' var2 = value2 ', 'var3 = str with = sign']
        d = self.submitter._env_arg_to_dict(env_arg)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual('value1', d['var1'])
        self.assertEqual('value2', d['var2'])
        self.assertEqual('str with = sign', d['var3'])