Exemple #1
0
    def parse_cmd_line(self, args):
        """
        Parse ``args`` and populate ``self.options`` and ``self.properties``.

        The two values returned by ``self.parser.load_config_and_cmd_line``
        are stored in ``self.options`` and ``self.left_over_args``.  The
        method then:

        * sets a default ``mapred.job.name`` and copies every entry of
          ``self.options.properties`` on top of ``self.properties`` (so a
          job name supplied through the options wins);
        * creates ``self.logger`` and sets its level from the
          ``seal.seqal.log.level`` property, falling back to DEBUG (with
          warnings) when the value is not a level constant of ``logging``;
        * sets ``mapred.cache.archives`` from ``self.options.reference``;
        * sets ``mapred.reduce.tasks``: 0 with ``align_only``, otherwise
          ``self.options.num_reducers`` when set, otherwise
          ``SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes()``.

        :param args: command line arguments, handed unchanged to
          ``self.parser.load_config_and_cmd_line``.
        """
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(args)

        # set the job name.  Do it here so the user can override it
        self.properties['mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties.  items() is used instead of the
        # Python 2-only iteritems() so this runs on Python 2 and 3 alike.
        for k, v in self.options.properties.items():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily make the logger verbose in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        # ("in" replaces dict.has_key, which no longer exists in Python 3)
        if self.ConfLogLevel_deprecated in self.properties:
            deprecation.deprecation_warning(self.logger, self.ConfLogLevel_deprecated, self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[self.ConfLogLevel_deprecated]
                self.logger.warning("Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel, self.ConfLogLevel_deprecated)

        # Set proper logging level
        level_name = self.properties['seal.seqal.log.level']
        log_level = getattr(logging, level_name, None)
        # getattr() also finds functions and classes of the logging module
        # (e.g. "info" or "Logger"); only integer level constants are valid.
        if not isinstance(log_level, int):
            log_level = None
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            # arguments follow the placeholders: offending value first, then
            # the name of the property it was read from
            self.logger.warning("Invalid configuration value '%s' for %s.  Check your configuration.",
                    level_name, self.ConfLogLevel)
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning("Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
                    SeqalSubmit.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties['mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            # align-only runs have no reduce phase; a positive request is
            # reported and ignored
            n_red_tasks = 0
            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning("Number of reduce tasks must be 0 when doing --align-only.")
                self.logger.warning("Ignoring request for %d reduce tasks", self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes()

        self.properties['mapred.reduce.tasks'] = n_red_tasks
Exemple #2
0
import argparse
import logging
import uuid
logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.hdfs as hdfs
import pydoop.hadut as hadut
import pydoop.utils as utils
import pydoop.utils.conversion_tables as conv_tables
from pydoop.mapreduce.pipes import PSTATS_DIR, PSTATS_FMT

from .argparse_types import kv_pair, a_file_that_can_be_read
from .argparse_types import a_comma_separated_list, a_hdfs_file

# Default number of reduce tasks: three times the value returned by
# hadut.get_num_nodes(offline=True), but never less than one.  Evaluated
# once, when this module is imported.
DEFAULT_REDUCE_TASKS = max(3 * hadut.get_num_nodes(offline=True), 1)
DEFAULT_ENTRY_POINT = '__main__'
# Property-name strings.  NOTE(review): these look like Hadoop configuration
# keys (pipes record reader/writer flags, distributed cache, job settings) --
# confirm against the code that consumes them.
IS_JAVA_RR = "hadoop.pipes.java.recordreader"
IS_JAVA_RW = "hadoop.pipes.java.recordwriter"
CACHE_FILES = "mapred.cache.files"
CACHE_ARCHIVES = "mapred.cache.archives"
USER_HOME = "mapreduce.admin.user.home.dir"
JOB_REDUCES = "mapred.reduce.tasks"
JOB_NAME = "mapred.job.name"
COMPRESS_MAP_OUTPUT = "mapred.compress.map.output"
# Accepted choices: 'k', 'v', 'kv' plus their upper-case forms, i.e.
# ['k', 'v', 'kv', 'K', 'V', 'KV'].
AVRO_IO_CHOICES = ['k', 'v', 'kv']
AVRO_IO_CHOICES += [_.upper() for _ in AVRO_IO_CHOICES]


class PydoopSubmitter(object):
    """
Exemple #3
0
    writer = ContextWriter(ctx)
    %(module)s.%(combiner_fn)s(key, PydoopScriptCombiner.iter(ctx), writer)

  def with_conf(self, ctx):
    key = ctx.getInputKey()
    writer = ContextWriter(ctx)
    %(module)s.%(combiner_fn)s(key, PydoopScriptReducer.iter(ctx), writer, self.conf)

if __name__ == '__main__':
  result = pydoop.pipes.runTask(pydoop.pipes.Factory(
    PydoopScriptMapper, PydoopScriptReducer, record_reader_class=None,
    record_writer_class=None, combiner_class=%(combiner_wp)s, partitioner_class=None))
  sys.exit(0 if result else 1)
"""

# Default number of reduce tasks: three times the value returned by
# hadut.get_num_nodes(offline=True), but never less than one.  Evaluated
# once, when this module is imported.
DEFAULT_REDUCE_TASKS = max(3 * hadut.get_num_nodes(offline=True), 1)
# Class-name strings for the output format.  NOTE(review): presumably the
# second one writes records without a key/value separator -- confirm where
# these constants are used.
DEFAULT_OUTPUT_FORMAT = "org.apache.hadoop.mapred.TextOutputFormat"
NOSEP_OUTPUT_FORMAT = 'it.crs4.pydoop.NoSeparatorTextOutputFormat'


def kv_pair(s):
  """
  Split ``s`` at its first ``=`` sign.

  Returns the list ``[key, value]``; any further ``=`` stays inside
  ``value``.  When ``s`` contains no ``=`` the result is the one-element
  list ``[s]``.
  """
  key, separator, value = s.partition("=")
  if not separator:
    # no "=" found: partition() left the whole string in ``key``
    return [key]
  return [key, value]


class PydoopScript(object):

  DESCRIPTION = "Easy MapReduce scripting with Pydoop"

  def __init__(self):
    self.logger = logging.getLogger("PydoopScript")
    self.properties = {
Exemple #4
0
    def parse_cmd_line(self, args):
        """
        Parse ``args`` and populate ``self.options`` and ``self.properties``.

        The two values returned by ``self.parser.load_config_and_cmd_line``
        are stored in ``self.options`` and ``self.left_over_args``.  The
        method then:

        * sets a default ``mapred.job.name`` and copies every entry of
          ``self.options.properties`` on top of ``self.properties`` (so a
          job name supplied through the options wins);
        * creates ``self.logger`` and sets its level from the
          ``seal.seqal.log.level`` property, falling back to DEBUG (with
          warnings) when the value is not a level constant of ``logging``;
        * sets ``mapred.cache.archives`` from ``self.options.reference``;
        * sets ``mapred.reduce.tasks``: 0 with ``align_only``, otherwise
          ``self.options.num_reducers`` when set, otherwise
          ``SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes()``.

        :param args: command line arguments, handed unchanged to
          ``self.parser.load_config_and_cmd_line``.
        """
        self.options, self.left_over_args = self.parser.load_config_and_cmd_line(
            args)

        # set the job name.  Do it here so the user can override it
        self.properties[
            'mapred.job.name'] = 'seqal_aln_%s' % self.options.output

        # now collect the property values specified in the options and
        # copy them to properties.  items() is used instead of the
        # Python 2-only iteritems() so this runs on Python 2 and 3 alike.
        for k, v in self.options.properties.items():
            self.properties[k] = v

        # create a logger
        logging.basicConfig()
        self.logger = logging.getLogger(self.__class__.LogName)
        # temporarily make the logger verbose in case we have to print warnings
        # regarding deprecated properties
        self.logger.setLevel(logging.DEBUG)
        # warn for deprecated bl.seqal.log.level property
        # ("in" replaces dict.has_key, which no longer exists in Python 3)
        if self.ConfLogLevel_deprecated in self.properties:
            deprecation.deprecation_warning(self.logger,
                                            self.ConfLogLevel_deprecated,
                                            self.ConfLogLevel)
            if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
                self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
                # the deprecated property is different from default, while the new property is not.  Therefore,
                # the user has set the deprecated property to a new value.  We'll use that one.
                self.properties[self.ConfLogLevel] = self.properties[
                    self.ConfLogLevel_deprecated]
                self.logger.warning(
                    "Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                    self.properties[self.ConfLogLevel], self.ConfLogLevel,
                    self.ConfLogLevel_deprecated)

        # Set proper logging level
        level_name = self.properties['seal.seqal.log.level']
        log_level = getattr(logging, level_name, None)
        # getattr() also finds functions and classes of the logging module
        # (e.g. "info" or "Logger"); only integer level constants are valid.
        if not isinstance(log_level, int):
            log_level = None
        if log_level is None:
            self.logger.setLevel(logging.DEBUG)
            # arguments follow the placeholders: offending value first, then
            # the name of the property it was read from
            self.logger.warning(
                "Invalid configuration value '%s' for %s.  Check your configuration.",
                level_name, self.ConfLogLevel)
            self.logger.warning("Falling back to DEBUG")
            self.logger.warning(
                "Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
                SeqalSubmit.DefaultLogLevel)
        else:
            self.logger.setLevel(log_level)

        # reference
        self.properties[
            'mapred.cache.archives'] = '%s#reference' % self.options.reference

        # set the number of reduce tasks
        if self.options.align_only:
            # align-only runs have no reduce phase; a positive request is
            # reported and ignored
            n_red_tasks = 0
            if self.options.num_reducers and self.options.num_reducers > 0:
                self.logger.warning(
                    "Number of reduce tasks must be 0 when doing --align-only."
                )
                self.logger.warning("Ignoring request for %d reduce tasks",
                                    self.options.num_reducers)
        elif self.options.num_reducers:
            n_red_tasks = self.options.num_reducers
        else:
            n_red_tasks = (SeqalSubmit.DefaultReduceTasksPerNode *
                           hadut.get_num_nodes())

        self.properties['mapred.reduce.tasks'] = n_red_tasks