Exemple #1
0
    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
        }
        self.hdfs = None
        self.options = None
        self.left_over_args = None
        self.logger = None
Exemple #2
0
    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'hadoop.pipes.java.recordreader': 'true',
            'hadoop.pipes.java.recordwriter': 'true',
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.hdfs = None
        self.options = None
Exemple #3
0
    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
        }
        self.hdfs = None
        self.options = None
        self.left_over_args = None
        self.logger = None
Exemple #4
0
    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'hadoop.pipes.java.recordreader': 'true',
            'hadoop.pipes.java.recordwriter': 'true',
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.hdfs = None
        self.options = None
Exemple #5
0
class SeqalRun(object):

    DefaultReduceTasksPerNode = 6
    LogName = "seqal"
    DefaultLogLevel = 'INFO'

    ConfLogLevel = 'seal.seqal.log.level'
    ConfLogLevel_deprecated = 'bl.seqal.log.level'

    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'hadoop.pipes.java.recordreader': 'true',
            'hadoop.pipes.java.recordwriter': 'true',
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.hdfs = None
        self.options = None

    def parse_cmd_line(self):
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(
        )

        # set the job name.  Do it here so the user can override it
        self.properties[
            'mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties
        for k, v in self.options.properties.iteritems():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily set to a high logging level in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        if self.properties.has_key(self.ConfLogLevel_deprecated):
            deprecation.deprecation_warning(self.logger,
                                            self.ConfLogLevel_deprecated,
                                            self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[
                    self.ConfLogLevel_deprecated]
                self.logger.warning(
                    "Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel,
                    self.ConfLogLevel_deprecated)

        # Set proper logging level
        log_level = getattr(logging, self.properties['seal.seqal.log.level'],
                            None)
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            self.logger.warning(
                "Invalid configuration value '%s' for %s.  Check your configuration.",
                self.ConfLogLevel, self.properties['seal.seqal.log.level'])
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning(
                "Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
                SeqalRun.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties[
            'mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            n_red_tasks = 0

            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning(
                    "Number of reduce tasks must be 0 when doing --align-only."
                )
                self.logger.warning("Ignoring request for %d reduce tasks",
                                    self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = SeqalRun.DefaultReduceTasksPerNode * hadut.num_nodes(
            )

        self.properties['mapred.reduce.tasks'] = n_red_tasks

    def __write_pipes_script(self, fd):
        ld_path = ":".join(
            filter(lambda x: x, [os.environ.get('LD_LIBRARY_PATH', None)]))
        pypath = os.environ.get('PYTHONPATH', '')
        self.logger.debug("LD_LIBRARY_PATH for tasks: %s", ld_path)
        self.logger.debug("PYTHONPATH for tasks: %s", pypath)

        fd.write("#!/bin/bash\n")
        fd.write('""":"\n')
        # should we set HOME to ~?  Hadoop by default sets $HOME to /homes, unless the
        # cluster administrator sets mapreduce.admin.user.home.dir.  This kills local installations
        #fd.write('[ -d "${HOME}" ] || export HOME="$(echo ~)"\n')
        # which causes python not to add installations under ~/.local/ to the PYTHONPATH
        fd.write(
            'export LD_LIBRARY_PATH="%s" # Seal dir + LD_LIBRARY_PATH copied from the env where you ran %s\n'
            % (ld_path, sys.argv[0]))
        fd.write('export PYTHONPATH="%s"\n' % pypath)
        if self.logger.isEnabledFor(logging.DEBUG):
            fd.write('env >&2\n')  # write the environment to the stderr log
            fd.write(
                'echo >&2; cat $0 >&2\n')  # write the script to the stderr log
        fd.write('exec "%s" -u "$0" "$@"\n' % sys.executable)
        fd.write('":"""\n')
        script = """
import sys
try:
  from seal.seqal import run_task
  run_task()
except ImportError as e:
  sys.stderr.write(str(e) + "\\n")
  sys.stderr.write("Can't import seal module\\n")
  sys.stderr.write("Did you install Seal to a system path on all the nodes?\\n")
  sys.stderr.write("If you installed to a non-system path (e.g. your home directory)\\n")
  sys.stderr.write("you'll have to set PYTHONPATH to point to it.\\n")
  sys.stderr.write("Current Python library paths:\\n")
  sys.stderr.write("  sys.path: %s:\\n" % ':'.join(map(str, sys.path)))
"""
        fd.write(script)

    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug(
                "Properties:\n%s", "\n".join(
                    sorted([
                        "%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()
                    ])))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                         self.options.input, self.options.output,
                         self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.',
                                                   suffix=str(random.random()),
                                                   dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(
                    self.remote_bin_name)['name']

                return hadut.run_pipes(full_name,
                                       self.options.input,
                                       self.options.output,
                                       properties=self.properties,
                                       args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(
                        self.remote_bin_name
                    )  # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted",
                                      self.remote_bin_name)
                except:
                    self.logger.error(
                        "Error deleting the temporary pipes script %s from HDFS",
                        self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")

    def __validate(self):
        if self.properties['mapred.reduce.tasks'] == 0:
            self.logger.info("Running in alignment-only mode (no rmdup).")

        # validate conditions
        if phdfs.path.exists(self.options.output):
            raise SeqalConfigError(
                "Output directory %s already exists.  Please delete it or specify a different output directory."
                % self.options.output)
        if not phdfs.path.exists(self.options.reference):
            raise SeqalConfigError("Can't read reference archive %s" %
                                   self.options.reference)
Exemple #6
0
class SeqalRun(object):

    DefaultReduceTasksPerNode = 6
    LogName = "seqal"
    DefaultLogLevel = 'INFO'

    ConfLogLevel = 'seal.seqal.log.level'
    ConfLogLevel_deprecated = 'bl.seqal.log.level'

    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'hadoop.pipes.java.recordreader': 'true',
            'hadoop.pipes.java.recordwriter': 'true',
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
            'bl.libhdfs.opts': '-Xmx48m'
        }
        self.hdfs = None
        self.options = None

    def parse_cmd_line(self, args):
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(args)

        # set the job name.  Do it here so the user can override it
        self.properties['mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties
        for k,v in self.options.properties.iteritems():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily set to a high logging level in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        if self.properties.has_key(self.ConfLogLevel_deprecated):
            deprecation.deprecation_warning(self.logger, self.ConfLogLevel_deprecated, self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[self.ConfLogLevel_deprecated]
                self.logger.warning("Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel, self.ConfLogLevel_deprecated)

        # Set proper logging level
        log_level = getattr(logging, self.properties['seal.seqal.log.level'], None)
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            self.logger.warning("Invalid configuration value '%s' for %s.  Check your configuration.", self.ConfLogLevel, self.properties['seal.seqal.log.level'])
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning("Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s", SeqalRun.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties['mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            n_red_tasks = 0

            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning("Number of reduce tasks must be 0 when doing --align-only.")
                self.logger.warning("Ignoring request for %d reduce tasks", self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = SeqalRun.DefaultReduceTasksPerNode * hadut.get_num_nodes()

        self.properties['mapred.reduce.tasks'] = n_red_tasks

    def __write_pipes_script(self, fd):
        ld_path = ":".join( filter(lambda x:x, [os.environ.get('LD_LIBRARY_PATH', None)]) )
        pypath = os.environ.get('PYTHONPATH', '')
        self.logger.debug("LD_LIBRARY_PATH for tasks: %s", ld_path)
        self.logger.debug("PYTHONPATH for tasks: %s", pypath)

        fd.write("#!/bin/bash\n")
        fd.write('""":"\n')
        # should we set HOME to ~?  Hadoop by default sets $HOME to /homes, unless the
        # cluster administrator sets mapreduce.admin.user.home.dir.  This kills local installations
        #fd.write('[ -d "${HOME}" ] || export HOME="$(echo ~)"\n')
        # which causes python not to add installations under ~/.local/ to the PYTHONPATH
        fd.write('export LD_LIBRARY_PATH="%s" # Seal dir + LD_LIBRARY_PATH copied from the env where you ran %s\n' % (ld_path, sys.argv[0]))
        fd.write('export PYTHONPATH="%s"\n' % pypath)
        if self.logger.isEnabledFor(logging.DEBUG):
            fd.write('env >&2\n') # write the environment to the stderr log
            fd.write('echo >&2; cat $0 >&2\n') # write the script to the stderr log
        fd.write('exec "%s" -u "$0" "$@"\n' % sys.executable)
        fd.write('":"""\n')
        script = """
import sys
try:
  from seal.seqal import run_task
  run_task()
except ImportError as e:
  sys.stderr.write(str(e) + "\\n")
  sys.stderr.write("Can't import seal module\\n")
  sys.stderr.write("Did you install Seal to a system path on all the nodes?\\n")
  sys.stderr.write("If you installed to a non-system path (e.g. your home directory)\\n")
  sys.stderr.write("you'll have to set PYTHONPATH to point to it.\\n")
  sys.stderr.write("Current Python library paths:\\n")
  sys.stderr.write("  sys.path: %s:\\n" % ':'.join(map(str, sys.path)))
"""
        fd.write(script)

    def run(self):
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug("Properties:\n%s", "\n".join( sorted([ "%s = %s" % (str(k), str(v)) for k,v in self.properties.iteritems() ]) ))
        self.logger.info("Input: %s; Output: %s; reference: %s", self.options.input, self.options.output, self.options.reference)

        try:
            self.hdfs = phdfs.hdfs('default', 0)
            self.__validate()

            self.remote_bin_name = tempfile.mktemp(prefix='seqal_bin.', suffix=str(random.random()), dir='')
            try:
                with self.hdfs.open_file(self.remote_bin_name, 'w') as script:
                    self.__write_pipes_script(script)

                full_name = self.hdfs.get_path_info(self.remote_bin_name)['name']

                return seal_utilities.run_pipes(full_name, self.options.input, self.options.output,
                    properties=self.properties, args_list=self.left_over_args)
            finally:
                try:
                    self.hdfs.delete(self.remote_bin_name) # delete the temporary pipes script from HDFS
                    self.logger.debug("pipes script %s deleted", self.remote_bin_name)
                except:
                    self.logger.error("Error deleting the temporary pipes script %s from HDFS", self.remote_bin_name)
                    ## don't re-raise the exception.  We're on our way out
        finally:
            if self.hdfs:
                tmp = self.hdfs
                self.hdfs = None
                tmp.close()
                self.logger.debug("HDFS closed")

    def __validate(self):
        if self.properties['mapred.reduce.tasks'] == 0:
            self.logger.info("Running in alignment-only mode (no rmdup).")

        # validate conditions
        if phdfs.path.exists(self.options.output):
            raise SeqalConfigError("Output directory %s already exists.  Please delete it or specify a different output directory." % self.options.output)
        if not phdfs.path.exists(self.options.reference):
            raise SeqalConfigError("Can't read reference archive %s" % self.options.reference)
Exemple #7
0
class SeqalSubmit(object):

    DefaultReduceTasksPerNode = 6
    LogName = "seqal"
    DefaultLogLevel = 'INFO'

    ConfLogLevel = 'seal.seqal.log.level'
    ConfLogLevel_deprecated = 'bl.seqal.log.level'

    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
        }
        self.hdfs = None
        self.options = None
        self.left_over_args = None
        self.logger = None

    def parse_cmd_line(self, args):
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(
            args)

        # set the job name.  Do it here so the user can override it
        self.properties[
            'mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties
        for k, v in self.options.properties.iteritems():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily set to a high logging level in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        if self.properties.has_key(self.ConfLogLevel_deprecated):
            deprecation.deprecation_warning(self.logger,
                                            self.ConfLogLevel_deprecated,
                                            self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[
                    self.ConfLogLevel_deprecated]
                self.logger.warning(
                    "Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel,
                    self.ConfLogLevel_deprecated)

        # Set proper logging level
        log_level = getattr(logging, self.properties['seal.seqal.log.level'],
                            None)
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            self.logger.warning(
                "Invalid configuration value '%s' for %s.  Check your configuration.",
                self.ConfLogLevel, self.properties['seal.seqal.log.level'])
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning(
                "Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
                SeqalSubmit.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties[
            'mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            n_red_tasks = 0
            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning(
                    "Number of reduce tasks must be 0 when doing --align-only."
                )
                self.logger.warning("Ignoring request for %d reduce tasks",
                                    self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes(
            )

        self.properties['mapred.reduce.tasks'] = n_red_tasks

    def run(self):
        self.logger.setLevel(logging.DEBUG)
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug(
                "Properties:\n%s", "\n".join(
                    sorted([
                        "%s = %s" % (str(k), str(v))
                        for k, v in self.properties.iteritems()
                    ])))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                         self.options.input, self.options.output,
                         self.options.reference)

        pydoop_argv = ['submit']

        # some properties have "pydoop submit" command line arguments which should be preferred
        if self.properties.has_key('mapred.job.name'):
            pydoop_argv.extend(
                ('--job-name', self.properties.pop('mapred.job.name')))
        if self.properties.has_key('mapred.reduce.tasks'):
            pydoop_argv.extend(
                ('--num-reducers',
                 str(self.properties.pop('mapred.reduce.tasks'))))
        if self.properties.has_key('mapred.cache.archives'):
            pydoop_argv.extend(('--cache-archive',
                                self.properties.pop('mapred.cache.archives')))

        pydoop_argv.extend("-D{}={}".format(k, v)
                           for k, v in self.properties.iteritems())

        pydoop_argv.append('seal.seqal.seqal_run')
        pydoop_argv.extend(('--entry-point', 'run_job'))
        pydoop_argv.extend(self.left_over_args)
        pydoop_argv.append(self.options.input)
        pydoop_argv.append(self.options.output)

        self.logger.debug("Calling pydoop.app.main with these args:")
        self.logger.debug(pydoop_argv)
        self.logger.info("Lauching job")
        pydoop_main(pydoop_argv)
        self.logger.info("finished")

    def __validate(self):
        if self.properties['mapred.reduce.tasks'] == 0:
            self.logger.info("Running in alignment-only mode (no rmdup).")

        # validate conditions
        if phdfs.path.exists(self.options.output):
            raise SeqalConfigError(
                "Output directory %s already exists.  "
                "Please delete it or specify a different output directory." %
                self.options.output)
        if not phdfs.path.exists(self.options.reference):
            raise SeqalConfigError("Can't read reference archive %s" %
                                   self.options.reference)
Exemple #8
0
class SeqalSubmit(object):

    DefaultReduceTasksPerNode = 6
    LogName = "seqal"
    DefaultLogLevel = 'INFO'

    ConfLogLevel = 'seal.seqal.log.level'
    ConfLogLevel_deprecated = 'bl.seqal.log.level'

    def __init__(self):
        self.parser = SeqalConfig()

        # set default properties
        self.properties = {
            self.ConfLogLevel: self.DefaultLogLevel,
            'mapred.create.symlink': 'yes',
            'mapred.compress.map.output': 'true',
        }
        self.hdfs = None
        self.options = None
        self.left_over_args = None
        self.logger = None

    def parse_cmd_line(self, args):
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(args)

        # set the job name.  Do it here so the user can override it
        self.properties['mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties
        for k,v in self.options.properties.iteritems():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily set to a high logging level in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        if self.properties.has_key(self.ConfLogLevel_deprecated):
            deprecation.deprecation_warning(self.logger, self.ConfLogLevel_deprecated, self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[self.ConfLogLevel_deprecated]
                self.logger.warning("Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel, self.ConfLogLevel_deprecated)

        # Set proper logging level
        log_level = getattr(logging, self.properties['seal.seqal.log.level'], None)
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            self.logger.warning("Invalid configuration value '%s' for %s.  Check your configuration.",
                    self.ConfLogLevel, self.properties['seal.seqal.log.level'])
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning("Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
                    SeqalSubmit.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties['mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            n_red_tasks = 0
            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning("Number of reduce tasks must be 0 when doing --align-only.")
                self.logger.warning("Ignoring request for %d reduce tasks", self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes()

        self.properties['mapred.reduce.tasks'] = n_red_tasks

    def run(self):
        self.logger.setLevel(logging.DEBUG)
        if self.options is None:
            raise RuntimeError("You must call parse_cmd_line before run")

        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("Running Seqal")
            self.logger.debug("Properties:\n%s",
                    "\n".join( sorted([ "%s = %s" % (str(k), str(v)) for k,v in self.properties.iteritems() ]) ))
        self.logger.info("Input: %s; Output: %s; reference: %s",
                self.options.input, self.options.output, self.options.reference)

        pydoop_argv = [ 'submit' ]

        # some properties have "pydoop submit" command line arguments which should be preferred
        if self.properties.has_key('mapred.job.name'):
            pydoop_argv.extend( ('--job-name', self.properties.pop('mapred.job.name')) )
        if self.properties.has_key('mapred.reduce.tasks'):
            pydoop_argv.extend( ('--num-reducers', str(self.properties.pop('mapred.reduce.tasks'))) )
        if self.properties.has_key('mapred.cache.archives'):
            pydoop_argv.extend( ('--cache-archive', self.properties.pop('mapred.cache.archives')) )

        pydoop_argv.extend( "-D{}={}".format(k, v) for k, v in self.properties.iteritems() )

        pydoop_argv.append('seal.seqal.seqal_run')
        pydoop_argv.extend( ('--entry-point', 'run_job' ))
        pydoop_argv.extend(self.left_over_args)
        pydoop_argv.append(self.options.input)
        pydoop_argv.append(self.options.output)

        self.logger.debug("Calling pydoop.app.main with these args:")
        self.logger.debug(pydoop_argv)
        self.logger.info("Lauching job")
        pydoop_main(pydoop_argv)
        self.logger.info("finished")

    def __validate(self):
        if self.properties['mapred.reduce.tasks'] == 0:
            self.logger.info("Running in alignment-only mode (no rmdup).")

        # validate conditions
        if phdfs.path.exists(self.options.output):
            raise SeqalConfigError(
                    "Output directory %s already exists.  "
                    "Please delete it or specify a different output directory." % self.options.output)
        if not phdfs.path.exists(self.options.reference):
            raise SeqalConfigError("Can't read reference archive %s" % self.options.reference)