Example #1
 def set_args(self, args):
   """
   Configure the pydoop script run, based on the arguments provided.
   """
   self.logger.setLevel(getattr(logging, args.log_level))
   parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
   self.remote_wd = hdfs.path.join(
     parent, utils.make_random_str(prefix="pydoop_script_")
     )
   self.remote_exe = hdfs.path.join(
     self.remote_wd, utils.make_random_str(prefix="exe")
     )
   module_bn = os.path.basename(args.module)
   self.remote_module_bn = utils.make_random_str(
     prefix="pydoop_script_", postfix=".py"
     )
   self.remote_module = hdfs.path.join(self.remote_wd, self.remote_module_bn)
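   # Note (added for clarity): Hadoop's distributed cache treats the part after
   # '#' as the symlink name created in each task's working directory.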
   dist_cache_parameter = "%s#%s" % (self.remote_module, self.remote_module_bn)
   self.properties['mapred.job.name'] = module_bn
   self.properties.update(dict(args.D or []))
   self.properties['mapred.reduce.tasks'] = args.num_reducers
   self.properties['mapred.textoutputformat.separator'] = args.kv_separator
   if self.properties['mapred.cache.files']:
     self.properties['mapred.cache.files'] += ','
   self.properties['mapred.cache.files'] += dist_cache_parameter
   self.args = args
Example #2
 def set_args(self, args):
     """
     Configure the pydoop script run, based on the arguments provided.
     """
     self.logger.setLevel(getattr(logging, args.log_level))
     parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
     self.remote_wd = hdfs.path.join(
         parent, utils.make_random_str(prefix="pydoop_script_"))
     self.remote_exe = hdfs.path.join(self.remote_wd,
                                      utils.make_random_str(prefix="exe"))
     module_bn = os.path.basename(args.module)
     _, ext = os.path.splitext(module_bn)
     # If the module doesn't have an extension, assume it should be .py
     # This could happen, for instance, if someone loads an executable module
     # as a script.  We can't blindly add .py though since the module may be a .pyc
     if not ext:
         ext = '.py'
     self.remote_module_bn = utils.make_random_str(prefix="pydoop_script_",
                                                   postfix=ext)
     self.remote_module = hdfs.path.join(self.remote_wd,
                                         self.remote_module_bn)
     dist_cache_parameter = "%s#%s" % (self.remote_module,
                                       self.remote_module_bn)
     self.properties['mapred.job.name'] = module_bn
     self.properties.update(dict(args.D or []))
     self.properties['mapred.reduce.tasks'] = args.num_reducers
     self.properties[
         'mapred.textoutputformat.separator'] = args.kv_separator
     if self.properties['mapred.cache.files']:
         self.properties['mapred.cache.files'] += ','
     self.properties['mapred.cache.files'] += dist_cache_parameter
     self.args = args
Example #3
 def set_args(self, args):
   """
   Configure the pydoop script run, based on the arguments provided.
   """
   self.logger.setLevel(getattr(logging, args.log_level))
   parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
   self.remote_wd = hdfs.path.join(
     parent, utils.make_random_str(prefix="pydoop_script_")
     )
   self.remote_exe = hdfs.path.join(
     self.remote_wd, utils.make_random_str(prefix="exe")
     )
   module_bn = os.path.basename(args.module)
   _, ext = os.path.splitext(module_bn)
   # If the module doesn't have an extension, assume it should be .py
   # This could happen, for instance, if someone loads an executable module
   # as a script.  We can't blindly add .py though since the module may be a .pyc
   if not ext:
     ext = '.py'
   self.remote_module_bn = utils.make_random_str(
     prefix="pydoop_script_", postfix=ext
     )
   self.remote_module = hdfs.path.join(self.remote_wd, self.remote_module_bn)
   dist_cache_parameter = "%s#%s" % (self.remote_module, self.remote_module_bn)
   self.properties['mapred.job.name'] = module_bn
   self.properties.update(dict(args.D or []))
   self.properties['mapred.reduce.tasks'] = args.num_reducers
   self.properties['mapred.textoutputformat.separator'] = args.kv_separator
   if self.properties['mapred.cache.files']:
     self.properties['mapred.cache.files'] += ','
   self.properties['mapred.cache.files'] += dist_cache_parameter
   self.args = args
Example #4
    def convert_args(self, args, unknown_args):
        # Create a zip archive containing all we need to run the
        # script (including the script itself).  We use
        # NamedTemporaryFile, which will take care of deleting the temp
        # archive once we're done.
        self.script_archive = NamedTemporaryFile(prefix="pydoop_script_",
                                                 suffix='.zip')
        zip_filename = self.script_archive.name
        # Create a one-off temporary file name to avoid name clashes
        # in the distcache.  Keep the same module extension -- it may
        # be a source file or a byte-compiled file
        mr_module = utils.make_random_str(prefix="pydoop_script_module_",
                                          postfix=os.path.basename(
                                              args.module))
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module)
            zipf.writestr(
                mr_driver + '.py',
                self.generate_driver(os.path.splitext(mr_module)[0], args))
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None
        args.keep_wd = False
        args.pstats_dir = None
        args.pstats_fmt = None

        # despicable hack...
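        # Added note: if the job uses the default text output format with an
        # empty key/value separator, switch to the no-separator output format
        # and add pydoop.jar to libjars (when the jar can be found).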
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
Example #5
    def convert_args(self, args, unknown_args):
        zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                             postfix='.zip')
        mr_module = utils.make_random_str(prefix="pydoop_script_module_")
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module + '.py')
            zipf.writestr(mr_driver + '.py',
                          self.generate_driver(mr_module, args))
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.input_format = None
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
        args.local_fs = False
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None

        # despicable hack...
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
        self.zip_filename = zip_filename
Example #6
    def convert_args(self, args, unknown_args):
        zip_filename = utils.make_random_str(prefix="pydoop_script_",
                                             postfix='.zip')
        mr_module = utils.make_random_str(prefix="pydoop_script_module_")
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module+'.py')
            zipf.writestr(mr_driver+'.py',
                          self.generate_driver(mr_module, args))
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.input_format = None
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.mrv2 = pydoop.hadoop_version_info().has_mrv2()
        args.local_fs = False
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None

        # despicable hack...
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
        self.zip_filename = zip_filename
Example #7
    def set_args(self, args, unknown_args=None):
        """
        Configure job, based on the arguments provided.
        """
        if unknown_args is None:
            unknown_args = []
        self.logger.setLevel(getattr(logging, args.log_level))

        parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
        self.remote_wd = hdfs.path.join(
            parent, utils.make_random_str(prefix="pydoop_submit_")
        )
        self.remote_exe = hdfs.path.join(self.remote_wd, str(uuid.uuid4()))
        self.properties[JOB_NAME] = args.job_name or 'pydoop'
        self.properties[IS_JAVA_RR] = (
            'false' if args.do_not_use_java_record_reader else 'true'
        )
        self.properties[IS_JAVA_RW] = (
            'false' if args.do_not_use_java_record_writer else 'true'
        )
        self.properties[JOB_REDUCES] = args.num_reducers
        if args.job_name:
            self.properties[JOB_NAME] = args.job_name
        self.properties.update(dict(args.D or []))
        self.properties.update(dict(args.job_conf or []))
        self.__set_files_to_cache(args)
        self.__set_archives_to_cache(args)
        self.requested_env = self._env_arg_to_dict(args.set_env or [])
        self.args = args
        self.unknown_args = unknown_args
Example #8
    def set_args(self, args, unknown_args=None):
        """
        Configure job, based on the arguments provided.
        """
        if unknown_args is None:
            unknown_args = []
        self.logger.setLevel(getattr(logging, args.log_level))

        parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
        self.remote_wd = hdfs.path.join(
            parent, utils.make_random_str(prefix="pydoop_submit_"))
        self.remote_exe = hdfs.path.join(self.remote_wd, str(uuid.uuid4()))
        self.properties[JOB_NAME] = args.job_name or 'pydoop'
        self.properties[IS_JAVA_RR] = (
            'false' if args.do_not_use_java_record_reader else 'true')
        self.properties[IS_JAVA_RW] = (
            'false' if args.do_not_use_java_record_writer else 'true')
        self.properties[JOB_REDUCES] = args.num_reducers
        if args.job_name:
            self.properties[JOB_NAME] = args.job_name
        self.properties.update(dict(args.D or []))
        self.properties.update(dict(args.job_conf or []))
        self.__set_files_to_cache(args)
        self.__set_archives_to_cache(args)
        self.requested_env = self._env_arg_to_dict(args.set_env or [])
        self.args = args
        self.unknown_args = unknown_args
Example #9
    def set_args(self, args, unknown_args=[]):
        """
        Configure job, based on the arguments provided.
        """
        self.logger.setLevel(getattr(logging, args.log_level))

        parent = hdfs.path.dirname(hdfs.path.abspath(args.output.rstrip("/")))
        self.remote_wd = hdfs.path.join(
            parent, utils.make_random_str(prefix="pydoop_submit_"))
        self.remote_exe = args.program
        self.properties[JOB_NAME] = args.job_name or 'pydoop'
        self.properties[IS_JAVA_RR] = (
            'false' if args.do_not_use_java_record_reader else 'true')
        self.properties[IS_JAVA_RW] = (
            'false' if args.do_not_use_java_record_writer else 'true')
        if args.input_format:
            self.properties[(INPUT_FORMAT_MRV2 if args.mrv2 else
                             INPUT_FORMAT_MRV1)] = args.input_format
        if args.output_format:
            self.properties[(OUTPUT_FORMAT_MRV2 if args.mrv2 else
                             OUTPUT_FORMAT_MRV1)] = args.output_format
        self.properties[JOB_REDUCES] = args.num_reducers
        if args.job_name:
            self.properties[JOB_NAME] = args.job_name
        self.properties.update(dict(args.D or []))
        self.properties.update(dict(args.job_conf or []))
        self.__set_files_to_cache(args)
        self.__set_archives_to_cache(args)
        self.args = args
        self.unknown_args = unknown_args
Example #10
 def __init__(self, prefix=None, logger=None):
   self.wd = self.exe = self.input = self.output = None
   self.logger = logger or utils.NullLogger()
   if prefix:
     self.wd = utils.make_random_str(prefix=prefix)
     hdfs.mkdir(self.wd)
     for n in "input", "output":
       setattr(self, n, hdfs.path.join(self.wd, n))
Example #11
 def __init__(self, prefix=None, logger=None):
     self.wd = self.exe = self.input = self.output = None
     self.logger = logger or utils.NullLogger()
     if prefix:
         self.wd = utils.make_random_str(prefix=prefix)
         hdfs.mkdir(self.wd)
         for n in "input", "output":
             setattr(self, n, hdfs.path.join(self.wd, n))
Example #12
 def set_exe(self, pipes_code):
     """
     Dump launcher code to the distributed file system.
     """
     if not self.output:
         raise RuntimeError("no output directory, can't create launcher")
     parent = hdfs.path.dirname(hdfs.path.abspath(self.output.rstrip("/")))
     self.exe = hdfs.path.join(parent, utils.make_random_str())
     hdfs.dump(pipes_code, self.exe)
Example #13
 def set_exe(self, pipes_code):
   """
   Dump launcher code to the distributed file system.
   """
   if not self.output:
     raise RuntimeError("no output directory, can't create launcher")
   parent = hdfs.path.dirname(hdfs.path.abspath(self.output.rstrip("/")))
   self.exe = hdfs.path.join(parent, utils.make_random_str())
   hdfs.dump(pipes_code, self.exe)
Example #14
def mapper(_, record, writer, conf):
    out_dir = conf.get('out.dir', utils.make_random_str())
    if not hdfs.path.isdir(out_dir):
        hdfs.mkdir(out_dir)
        hdfs.chmod(out_dir, 'g+rwx')
    img_path = record.strip()
    a = get_array(img_path)
    out_a = calc_features(a)
    out_path = hdfs.path.join(out_dir, '%s.out' % hdfs.path.basename(img_path))
    with hdfs.open(out_path, 'w') as fo:
        np.save(fo, out_a)  # actual output
    hdfs.chmod(out_path, 'g+rw')
    writer.emit(img_path, fo.name)  # info (tab-separated input-output)
Example #15
 def test_isdir(self):
   path = utils.make_random_str()
   self.assertFalse(hdfs.path.isdir(path))
   try:
     hdfs.dump("foo\n", path)
     self.assertFalse(hdfs.path.isdir(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertTrue(hdfs.path.isdir(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #16
 def test_kind(self):
   path = utils.make_random_str()
   self.assertTrue(hdfs.path.kind(path) is None)
   try:
     hdfs.dump("foo\n", path)
     self.assertEqual('file', hdfs.path.kind(path))
     hdfs.rmr(path)
     hdfs.mkdir(path)
     self.assertEqual('directory', hdfs.path.kind(path))
   finally:
     try:
       hdfs.rmr(path)
     except IOError:
       pass
Example #17
 def good(self):
   path = utils.make_random_str()
   hdfs.dump("foo\n", path)
   self.assertTrue(hdfs.path.exists(path))
   hdfs.rmr(path)
   self.assertFalse(hdfs.path.exists(path))
Example #18
    def convert_args(self, args, unknown_args):
        # Create a zip archive containing all we need to run the
        # script (including the script itself).  We use
        # NamedTemporaryFile, which will take care of deleting the temp
        # archive once we're done.
        self.script_archive = NamedTemporaryFile(
            prefix="pydoop_script_",
            suffix='.zip'
        )
        zip_filename = self.script_archive.name
        # Create a one-off temporary file name to avoid name clashes
        # in the distcache.  Keep the same module extension -- it may
        # be a source file or a byte-compiled file
        mr_module = utils.make_random_str(
            prefix="pydoop_script_module_",
            postfix=os.path.basename(args.module)
        )
        mr_driver = utils.make_random_str(prefix="pydoop_script_driver_")
        with ZipFile(zip_filename, 'w') as zipf:
            zipf.write(args.module, arcname=mr_module)
            zipf.writestr(
                mr_driver + '.py',
                self.generate_driver(os.path.splitext(mr_module)[0], args)
            )
        if args.python_zip is None:
            args.python_zip = [zip_filename]
        else:
            args.python_zip.append(zip_filename)
        args.module = mr_driver
        args.entry_point = 'main'
        args.program = mr_driver
        args.do_not_use_java_record_reader = False
        args.do_not_use_java_record_writer = False
        args.output_format = None
        args.cache_file = None
        args.cache_archive = None
        args.upload_to_cache = None
        args.libjars = None
        args.conf = None
        args.disable_property_name_conversion = True
        args.job_conf = [('mapred.textoutputformat.separator',
                          args.kv_separator)]
        args.avro_input = None
        args.avro_output = None
        args.keep_wd = False
        args.pstats_dir = None
        args.pstats_fmt = None

        # despicable hack...
        properties = dict(args.D or [])
        properties.update(dict(args.job_conf))
        output_format = properties.get('mapred.output.format.class',
                                       DEFAULT_OUTPUT_FORMAT)
        if output_format == DEFAULT_OUTPUT_FORMAT:
            if properties['mapred.textoutputformat.separator'] == '':
                pydoop_jar = pydoop.jar_path()
                if pydoop_jar is not None:
                    args.output_format = NOSEP_OUTPUT_FORMAT
                    args.libjars = [pydoop_jar]
                else:
                    warnings.warn(("Can't find pydoop.jar, output will "
                                   "probably be tab-separated"))
        self.args, self.unknown_args = args, unknown_args
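
Note: every example above calls utils.make_random_str(prefix=..., postfix=...) to build a collision-free file or directory name. The actual pydoop.utils implementation is not shown on this page; a minimal sketch, assuming it simply joins the prefix, a unique random token and an optional postfix, might look like the following (illustration only, not pydoop's code):

import uuid

def make_random_str(prefix="pydoop_", postfix=""):
    # Illustrative sketch only -- NOT the actual pydoop.utils implementation.
    # Concatenate the prefix, a unique random token and an optional postfix.
    return "%s%s%s" % (prefix, uuid.uuid4().hex, postfix)

# e.g. make_random_str(prefix="pydoop_script_", postfix=".py")
# could return something like "pydoop_script_5f1c0ce2a7.py"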