def test_pydoop_jar_path(self):
    jar_path = pydoop.jar_path()
    if jar_path is not None:
        self.assertTrue(os.path.exists(jar_path))
        directory, filename = os.path.split(jar_path)
        self.assertEqual(filename, pydoop.jar_name())
        self.assertEqual('pydoop', os.path.basename(directory))

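# Hedged usage sketch (not part of the test suite above): it shows how the
# contract exercised by test_pydoop_jar_path -- jar_path() returns either
# None or the path to pydoop.jar inside the installed 'pydoop' package
# directory -- might be used by a caller that needs the jar on a classpath.
# The build_classpath helper and its extra_entries parameter are
# hypothetical names introduced only for illustration.
import pydoop


def build_classpath(extra_entries=()):
    entries = list(extra_entries)
    jar = pydoop.jar_path()
    if jar is not None:
        # only add the jar if the installation actually ships it
        entries.append(jar)
    return ':'.join(entries)
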
def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None, keep_streams=False):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    ``hadoop.pipes.java.recordreader`` and
    ``hadoop.pipes.java.recordwriter`` to ``"true"``.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing ``force_pydoop_submitter=True``.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault("hadoop.pipes.java.recordreader", "true")
    properties.setdefault("hadoop.pipes.java.recordwriter", "true")
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.is_cdh_mrv2() and hdfs.default_is_local():
                raise RuntimeError("mrv2 on local fs not supported yet")
            use_pydoop_submit = hdfs.default_is_local()
    args = ["-program", executable, "-input", input_path,
            "-output", output_path]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties,
                         classpath=pydoop_jar, logger=logger,
                         keep_streams=keep_streams)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger,
                       keep_streams=keep_streams)

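# Hedged usage sketch for run_pipes() as defined above.  The executable and
# HDFS paths are hypothetical placeholders, and "-reduces 4" is only meant
# to illustrate forwarding extra pipes options via more_args; the keyword
# arguments match the signature above, and the return value is whatever
# run_cmd / run_class returns for the chosen submitter.
import logging

logger = logging.getLogger("pipes_demo")
result = run_pipes(
    "hdfs:///user/me/bin/wordcount",       # hypothetical pipes executable
    "hdfs:///user/me/input",               # hypothetical input path
    "hdfs:///user/me/output",              # hypothetical output path
    more_args=["-reduces", "4"],           # extra args appended after I/O paths
    properties={"mapred.job.name": "wc"},  # merged with the recordreader/writer defaults
    logger=logger,
)
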
def run(self):
    if self.args is None:
        raise RuntimeError("cannot run without args, please call set_args")
    self.__validate()
    pipes_args = []
    output_format = self.properties.get('mapred.output.format.class',
                                        DEFAULT_OUTPUT_FORMAT)
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if self.properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                self.properties[
                    'mapred.output.format.class'] = NOSEP_OUTPUT_FORMAT
                pipes_args.extend(['-libjars', pydoop_jar])
            else:
                warnings.warn(
                    "Can't find pydoop.jar, output will probably be "
                    "tab-separated"
                )
    try:
        self.__setup_remote_paths()
        hadut.run_pipes(self.remote_exe, self.args.input, self.args.output,
                        more_args=pipes_args, properties=self.properties,
                        logger=self.logger)
        self.logger.info("Done")
    finally:
        self.__clean_wd()

def run(self):
    if self.args is None:
        raise RuntimeError("cannot run without args, please call set_args")
    self.__validate()
    pydoop_classpath = []
    libjars = []
    if self.args.libjars:
        libjars.extend(self.args.libjars)
    if self.args.avro_input or self.args.avro_output:
        # append Pydoop's avro-mapred jar.  Don't put it at the front of
        # the list or the user won't be able to override it.
        avro_jars = glob.glob(
            os.path.join(pydoop.package_dir(), "avro*.jar"))
        pydoop_classpath.extend(avro_jars)
        libjars.extend(avro_jars)
    pydoop_jar = pydoop.jar_path()
    if pydoop_jar is None:
        raise RuntimeError("Can't find pydoop.jar")
    job_args = []
    submitter_class = 'it.crs4.pydoop.mapreduce.pipes.Submitter'
    pydoop_classpath.append(pydoop_jar)
    libjars.append(pydoop_jar)
    self.logger.debug("Submitter class: %s", submitter_class)
    if self.args.hadoop_conf:
        job_args.extend(['-conf', self.args.hadoop_conf.name])
    if self.args.input_format:
        job_args.extend(['-inputformat', self.args.input_format])
    if self.args.output_format:
        job_args.extend(['-writer', self.args.output_format])
    job_args.extend(['-input', self.args.input])
    job_args.extend(['-output', self.args.output])
    job_args.extend(['-program', self.remote_exe])
    if libjars:
        job_args.extend(["-libjars", ','.join(libjars)])
    if self.args.avro_input:
        job_args.extend(['-avroInput', self.args.avro_input])
    if self.args.avro_output:
        job_args.extend(['-avroOutput', self.args.avro_output])
    if not self.args.disable_property_name_conversion:
        ctable = conv_tables.mrv1_to_mrv2
        props = [(ctable.get(k, k), v) for (k, v) in self.properties.items()]
        self.properties = dict(props)
        self.logger.debug("properties after projection: %r", self.properties)
    try:
        self.__setup_remote_paths()
        executor = (hadut.run_class if not self.args.pretend
                    else self.fake_run_class)
        executor(submitter_class, args=job_args, properties=self.properties,
                 classpath=pydoop_classpath, logger=self.logger,
                 keep_streams=False)
        self.logger.info("Done")
    finally:
        if not self.args.keep_wd:
            self.__clean_wd()

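# Hedged sketch of the property-name projection performed in run() above.
# conv_tables.mrv1_to_mrv2 maps old-style (mrv1) configuration keys to their
# mrv2 equivalents; the two sample entries below are illustrative stand-ins,
# not the actual table contents.
sample_ctable = {
    "mapred.job.name": "mapreduce.job.name",
    "mapred.reduce.tasks": "mapreduce.job.reduces",
}
user_props = {"mapred.job.name": "demo", "my.custom.key": "1"}
# unknown keys pass through unchanged, exactly as ctable.get(k, k) does
converted = {sample_ctable.get(k, k): v for k, v in user_props.items()}
# converted == {"mapreduce.job.name": "demo", "my.custom.key": "1"}
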
def convert_args(self, args, unknown_args):
    # Create a zip archive containing all we need to run the script
    # (including the script itself).  We use NamedTemporaryFile, which
    # will take care of deleting the temp archive once we're done.
    self.script_archive = NamedTemporaryFile(prefix="pydoop_script_",
                                             suffix='.zip')
    zip_filename = self.script_archive.name
    # Create a one-off temporary file name to avoid name clashes in the
    # distcache.  Keep the same module extension -- it may be a source
    # file or a byte-compiled file.
    mr_module = utils.make_random_str(
        prefix="pydoop_script_module_",
        postfix=os.path.basename(args.module)
    )
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module)
        zipf.writestr(
            mr_driver + '.py',
            self.generate_driver(os.path.splitext(mr_module)[0], args)
        )
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    args.keep_wd = False
    args.pstats_dir = None
    args.pstats_fmt = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get('mapred.output.format.class',
                                   DEFAULT_OUTPUT_FORMAT)
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(("Can't find pydoop.jar, output will "
                               "probably be tab-separated"))
    self.args, self.unknown_args = args, unknown_args

def run(self):
    if self.args is None:
        raise RuntimeError("cannot run without args, please call set_args")
    self.__validate()
    libjars = []
    if self.args.libjars:
        libjars.extend(self.args.libjars)
    pydoop_jar = pydoop.jar_path()
    if self.args.mrv2 and pydoop_jar is None:
        raise RuntimeError("Can't find pydoop.jar, cannot switch to mrv2")
    if self.args.local_fs and pydoop_jar is None:
        raise RuntimeError(
            "Can't find pydoop.jar, cannot use local fs patch")
    job_args = []
    if self.args.mrv2:
        submitter_class = 'it.crs4.pydoop.mapreduce.pipes.Submitter'
        classpath = pydoop_jar
        libjars.append(pydoop_jar)
    elif self.args.local_fs:
        # FIXME we still need to handle the special case with
        # hadoop security and local file system.
        raise RuntimeError("NOT IMPLEMENTED YET")
        # FIXME FAKE MODULE
        submitter_class = 'it.crs4.pydoop.mapred.pipes.Submitter'
        classpath = pydoop_jar
        libjars.append(pydoop_jar)
    else:
        submitter_class = 'org.apache.hadoop.mapred.pipes.Submitter'
        classpath = None
    if self.args.hadoop_conf:
        job_args.extend(['-conf', self.args.hadoop_conf.name])
    job_args.extend(['-input', self.args.input])
    job_args.extend(['-output', self.args.output])
    job_args.extend(['-program', self.remote_exe])
    if libjars:
        job_args.extend(["-libjars", ','.join(libjars)])
    if not self.args.disable_property_name_conversion:
        ctable = (conv_tables.mrv1_to_mrv2
                  if self.args.mrv2
                  else conv_tables.mrv2_to_mrv1)
        props = [(ctable.get(k, k), v)
                 for (k, v) in self.properties.iteritems()]
        self.properties = dict(props)
        self.logger.debug("properties after projection: %r", self.properties)
    try:
        self.__setup_remote_paths()
        executor = (hadut.run_class if not self.args.pretend
                    else self.fake_run_class)
        executor(submitter_class, args=job_args, properties=self.properties,
                 classpath=classpath, logger=self.logger)
        self.logger.info("Done")
    finally:
        self.__clean_wd()

def _run_java(self, in_uri, out_uri, wd):
    this_directory = os.path.abspath(os.path.dirname(__file__))
    shutil.copytree(os.path.join(this_directory, _JAVA_SRC_ROOT),
                    os.path.join(wd, _JAVA_SRC_ROOT))
    classpath = '.:%s:%s:%s' % (
        wd, pydoop.jar_path(), pydoop.hadoop_classpath())
    src = os.path.join(wd, _OPAQUE_ROUNDTRIP_SRC)
    utils.compile_java(src, classpath)
    utils.run_java(_OPAQUE_ROUNDTRIP_CLASS, classpath, [in_uri, out_uri], wd)

def convert_args(self, args, unknown_args):
    zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                         postfix='.zip')
    mr_module = utils.make_random_str(prefix="pydoop_script_module_")
    mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
    with ZipFile(zip_filename, 'w') as zipf:
        zipf.write(args.module, arcname=mr_module + '.py')
        zipf.writestr(mr_driver + '.py',
                      self.generate_driver(mr_module, args))
    if args.python_zip is None:
        args.python_zip = [zip_filename]
    else:
        args.python_zip.append(zip_filename)
    args.module = mr_driver
    args.entry_point = 'main'
    args.program = mr_driver
    args.do_not_use_java_record_reader = False
    args.do_not_use_java_record_writer = False
    args.input_format = None
    args.output_format = None
    args.cache_file = None
    args.cache_archive = None
    args.upload_to_cache = None
    args.libjars = None
    args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
    args.local_fs = False
    args.conf = None
    args.disable_property_name_conversion = True
    args.job_conf = [('mapred.textoutputformat.separator', args.kv_separator)]
    args.avro_input = None
    args.avro_output = None
    # despicable hack...
    properties = dict(args.D or [])
    properties.update(dict(args.job_conf))
    output_format = properties.get('mapred.output.format.class',
                                   DEFAULT_OUTPUT_FORMAT)
    if output_format == DEFAULT_OUTPUT_FORMAT:
        if properties['mapred.textoutputformat.separator'] == '':
            pydoop_jar = pydoop.jar_path()
            if pydoop_jar is not None:
                args.output_format = NOSEP_OUTPUT_FORMAT
                args.libjars = [pydoop_jar]
            else:
                warnings.warn(("Can't find pydoop.jar, output will "
                               "probably be tab-separated"))
    self.args, self.unknown_args = args, unknown_args
    self.zip_filename = zip_filename

def run_pipes(executable, input_path, output_path, more_args=None,
              properties=None, force_pydoop_submitter=False,
              hadoop_conf_dir=None, logger=None):
    """
    Run a pipes command.

    ``more_args`` (after setting input/output path) and ``properties``
    are passed to :func:`run_cmd`.

    If not specified otherwise, this function sets the properties
    hadoop.pipes.java.recordreader and hadoop.pipes.java.recordwriter
    to 'true'.

    This function works around a bug in Hadoop pipes that affects
    versions of Hadoop with security when the local file system is
    used as the default FS (no HDFS); see
    https://issues.apache.org/jira/browse/MAPREDUCE-4000.  In those
    set-ups, the function uses Pydoop's own pipes submitter
    application.  You can force the use of Pydoop's submitter by
    passing the argument force_pydoop_submitter=True.
    """
    if logger is None:
        logger = utils.NullLogger()
    if not hdfs.path.exists(executable):
        raise IOError("executable %s not found" % executable)
    if not hdfs.path.exists(input_path) and not (set(input_path) & GLOB_CHARS):
        raise IOError("input path %s not found" % input_path)
    if properties is None:
        properties = {}
    properties.setdefault('hadoop.pipes.java.recordreader', 'true')
    properties.setdefault('hadoop.pipes.java.recordwriter', 'true')
    if force_pydoop_submitter:
        use_pydoop_submit = True
    else:
        use_pydoop_submit = False
        ver = pydoop.hadoop_version_info()
        if ver.has_security():
            if ver.cdh >= (4, 0, 0) and not ver.ext and hdfs.default_is_local():
                raise RuntimeError(
                    "mrv2 on local fs not supported yet")  # FIXME
            use_pydoop_submit = hdfs.default_is_local()
    args = [
        "-program", executable,
        "-input", input_path,
        "-output", output_path,
    ]
    if more_args is not None:
        args.extend(more_args)
    if use_pydoop_submit:
        submitter = "it.crs4.pydoop.pipes.Submitter"
        pydoop_jar = pydoop.jar_path()
        args.extend(("-libjars", pydoop_jar))
        return run_class(submitter, args, properties,
                         classpath=pydoop_jar, logger=logger)
    else:
        return run_cmd("pipes", args, properties,
                       hadoop_conf_dir=hadoop_conf_dir, logger=logger)

def run(self):
    if self.args is None:
        raise RuntimeError("cannot run without args, please call set_args")
    self.__validate()
    classpath = []
    libjars = []
    if self.args.avro_input or self.args.avro_output:
        avro_jars = glob.glob(os.path.join(
            pydoop.package_dir(), "avro*.jar"
        ))
        classpath.extend(avro_jars)
        libjars.extend(avro_jars)
    if self.args.libjars:
        libjars.extend(self.args.libjars)
    pydoop_jar = pydoop.jar_path()
    if self.args.mrv2 and pydoop_jar is None:
        raise RuntimeError("Can't find pydoop.jar, cannot switch to mrv2")
    if self.args.local_fs and pydoop_jar is None:
        raise RuntimeError(
            "Can't find pydoop.jar, cannot use local fs patch"
        )
    job_args = []
    if self.args.mrv2:
        submitter_class = 'it.crs4.pydoop.mapreduce.pipes.Submitter'
        classpath.append(pydoop_jar)
        libjars.append(pydoop_jar)
    elif self.args.local_fs:
        # FIXME we still need to handle the special case with
        # hadoop security and local file system.
        raise RuntimeError("NOT IMPLEMENTED YET")
        # FIXME FAKE MODULE
        submitter_class = 'it.crs4.pydoop.mapred.pipes.Submitter'
        classpath.append(pydoop_jar)
        libjars.append(pydoop_jar)
    else:
        submitter_class = 'org.apache.hadoop.mapred.pipes.Submitter'
    if self.args.hadoop_conf:
        job_args.extend(['-conf', self.args.hadoop_conf.name])
    if self.args.input_format:
        job_args.extend(['-inputformat', self.args.input_format])
    if self.args.output_format:
        job_args.extend(['-writer', self.args.output_format])
    job_args.extend(['-input', self.args.input])
    job_args.extend(['-output', self.args.output])
    job_args.extend(['-program', self.remote_exe])
    if libjars:
        job_args.extend(["-libjars", ','.join(libjars)])
    if self.args.avro_input:
        job_args.extend(['-avroInput', self.args.avro_input])
    if self.args.avro_output:
        job_args.extend(['-avroOutput', self.args.avro_output])
    if not self.args.disable_property_name_conversion:
        ctable = (conv_tables.mrv1_to_mrv2
                  if self.args.mrv2
                  else conv_tables.mrv2_to_mrv1)
        props = [
            (ctable.get(k, k), v)
            for (k, v) in self.properties.iteritems()
        ]
        self.properties = dict(props)
        self.logger.debug("properties after projection: %r", self.properties)
    try:
        self.__setup_remote_paths()
        executor = (hadut.run_class if not self.args.pretend
                    else self.fake_run_class)
        executor(submitter_class, args=job_args, properties=self.properties,
                 classpath=classpath, logger=self.logger, keep_streams=False)
        self.logger.info("Done")
    finally:
        self.__clean_wd()