def parse_cmd_line(self, args):
    """
    Parse ``args`` and derive the job properties that depend on them.

    In order, this method:

    1. loads configuration and command line through ``self.parser`` and
       stores the result in ``self.options`` and ``self.left_over_args``;
    2. sets a default ``mapred.job.name`` and then copies the user-supplied
       properties into ``self.properties`` (so the user can override it);
    3. creates ``self.logger`` and sets its level from the
       ``seal.seqal.log.level`` property, taking the value of the deprecated
       property named by ``self.ConfLogLevel_deprecated`` when only that
       one was changed from the default;
    4. sets ``mapred.cache.archives`` from ``self.options.reference``;
    5. sets ``mapred.reduce.tasks``.

    :param args: command line arguments, handed unchanged to
        ``self.parser.load_config_and_cmd_line``.
    """
    self.options, self.left_over_args = self.parser.load_config_and_cmd_line(args)

    # Set the job name before copying the user's properties, so that a
    # user-specified mapred.job.name overrides this default.
    self.properties['mapred.job.name'] = 'seqal_aln_%s' % self.options.output

    # Copy the property values specified in the options into properties.
    # items() and ``in`` (below) work on both Python 2 and Python 3 mappings.
    for k, v in self.options.properties.items():
        self.properties[k] = v

    # Create the logger.
    logging.basicConfig()
    self.logger = logging.getLogger(self.__class__.LogName)
    # Let every message through for now, so that warnings about deprecated
    # properties are printed before the configured level is applied.
    self.logger.setLevel(logging.DEBUG)

    # Warn about the deprecated log-level property.
    if self.ConfLogLevel_deprecated in self.properties:
        deprecation.deprecation_warning(self.logger, self.ConfLogLevel_deprecated, self.ConfLogLevel)
        if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
           self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
            # The deprecated property differs from the default while the new
            # one does not, so the user set the deprecated one: use it.
            self.properties[self.ConfLogLevel] = self.properties[self.ConfLogLevel_deprecated]
            self.logger.warning(
                "Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                self.properties[self.ConfLogLevel], self.ConfLogLevel, self.ConfLogLevel_deprecated)

    # Apply the configured logging level.
    level_name = self.properties['seal.seqal.log.level']
    log_level = getattr(logging, level_name, None)
    # getattr() finds *any* attribute of the logging module (functions,
    # classes, ...); only an integer is a usable level.
    if not isinstance(log_level, int):
        self.logger.setLevel(logging.DEBUG)
        # Arguments follow the format string: offending value first, then
        # the name of the property it was given for.
        self.logger.warning("Invalid configuration value '%s' for %s. Check your configuration.",
                            level_name, self.ConfLogLevel)
        self.logger.warning("Falling back to DEBUG")
        self.logger.warning(
            "Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
            self.DefaultLogLevel)
    else:
        self.logger.setLevel(log_level)

    # Reference archive.
    self.properties['mapred.cache.archives'] = '%s#reference' % self.options.reference

    # Number of reduce tasks.
    if self.options.align_only:
        # Align-only runs use no reducers; an explicit request is ignored.
        n_red_tasks = 0
        if self.options.num_reducers and self.options.num_reducers > 0:
            self.logger.warning("Number of reduce tasks must be 0 when doing --align-only.")
            self.logger.warning("Ignoring request for %d reduce tasks", self.options.num_reducers)
    elif self.options.num_reducers:
        n_red_tasks = self.options.num_reducers
    else:
        n_red_tasks = SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes()

    self.properties['mapred.reduce.tasks'] = n_red_tasks
import argparse
import logging
import uuid

# Root logging is configured at import time, before the pydoop imports below.
# NOTE(review): presumably so that messages emitted while importing pydoop are
# already routed -- confirm before moving this call.
logging.basicConfig(level=logging.INFO)

import pydoop
import pydoop.hdfs as hdfs
import pydoop.hadut as hadut
import pydoop.utils as utils
import pydoop.utils.conversion_tables as conv_tables
from pydoop.mapreduce.pipes import PSTATS_DIR, PSTATS_FMT

from .argparse_types import kv_pair, a_file_that_can_be_read
from .argparse_types import a_comma_separated_list, a_hdfs_file

# Default reducer count: three times the node count returned by
# hadut.get_num_nodes(offline=True), and never less than one.
DEFAULT_REDUCE_TASKS = max(3 * hadut.get_num_nodes(offline=True), 1)
DEFAULT_ENTRY_POINT = '__main__'

# Property-name strings. NOTE(review): these look like Hadoop job
# configuration keys; how each one is used is not visible in this fragment.
IS_JAVA_RR = "hadoop.pipes.java.recordreader"
IS_JAVA_RW = "hadoop.pipes.java.recordwriter"
CACHE_FILES = "mapred.cache.files"
CACHE_ARCHIVES = "mapred.cache.archives"
USER_HOME = "mapreduce.admin.user.home.dir"
JOB_REDUCES = "mapred.reduce.tasks"
JOB_NAME = "mapred.job.name"
COMPRESS_MAP_OUTPUT = "mapred.compress.map.output"

# Accepted choices: 'k', 'v', 'kv' plus their upper-case forms, giving
# ['k', 'v', 'kv', 'K', 'V', 'KV'].
AVRO_IO_CHOICES = ['k', 'v', 'kv']
AVRO_IO_CHOICES += [_.upper() for _ in AVRO_IO_CHOICES]


class PydoopSubmitter(object):
    """
writer = ContextWriter(ctx) %(module)s.%(combiner_fn)s(key, PydoopScriptCombiner.iter(ctx), writer) def with_conf(self, ctx): key = ctx.getInputKey() writer = ContextWriter(ctx) %(module)s.%(combiner_fn)s(key, PydoopScriptReducer.iter(ctx), writer, self.conf) if __name__ == '__main__': result = pydoop.pipes.runTask(pydoop.pipes.Factory( PydoopScriptMapper, PydoopScriptReducer, record_reader_class=None, record_writer_class=None, combiner_class=%(combiner_wp)s, partitioner_class=None)) sys.exit(0 if result else 1) """
# NOTE(review): the text above is the tail of a triple-quoted string whose
# opening lies outside this fragment; it carries %(name)s placeholders and is
# kept exactly as found because its characters are runtime data.

# Default reducer count: three times the node count returned by
# hadut.get_num_nodes(offline=True), and never less than one.
DEFAULT_REDUCE_TASKS = max(3 * hadut.get_num_nodes(offline=True), 1)

# Dotted class names. NOTE(review): presumably Hadoop output format classes;
# their use is not visible in this fragment.
DEFAULT_OUTPUT_FORMAT = "org.apache.hadoop.mapred.TextOutputFormat"
NOSEP_OUTPUT_FORMAT = 'it.crs4.pydoop.NoSeparatorTextOutputFormat'


def kv_pair(s):
    """
    Split ``s`` at its first ``=`` and return the pieces as a list.

    ``"a=b=c"`` gives ``["a", "b=c"]``.  A string without ``=`` gives a
    one-element list, so callers that unpack two values must handle that.
    """
    return s.split("=", 1)


class PydoopScript(object):
    DESCRIPTION = "Easy MapReduce scripting with Pydoop"

    def __init__(self):
        self.logger = logging.getLogger("PydoopScript")
        self.properties = {
def parse_cmd_line(self, args):
    """
    Parse ``args`` and derive the job properties that depend on them.

    In order, this method:

    1. loads configuration and command line through ``self.parser`` and
       stores the result in ``self.options`` and ``self.left_over_args``;
    2. sets a default ``mapred.job.name`` and then copies the user-supplied
       properties into ``self.properties`` (so the user can override it);
    3. creates ``self.logger`` and sets its level from the
       ``seal.seqal.log.level`` property, taking the value of the deprecated
       property named by ``self.ConfLogLevel_deprecated`` when only that
       one was changed from the default;
    4. sets ``mapred.cache.archives`` from ``self.options.reference``;
    5. sets ``mapred.reduce.tasks``.

    :param args: command line arguments, handed unchanged to
        ``self.parser.load_config_and_cmd_line``.
    """
    self.options, self.left_over_args = self.parser.load_config_and_cmd_line(
        args)

    # Set the job name before copying the user's properties, so that a
    # user-specified mapred.job.name overrides this default.
    self.properties[
        'mapred.job.name'] = 'seqal_aln_%s' % self.options.output

    # Copy the property values specified in the options into properties.
    # items() and ``in`` (below) work on both Python 2 and Python 3 mappings.
    for k, v in self.options.properties.items():
        self.properties[k] = v

    # Create the logger.
    logging.basicConfig()
    self.logger = logging.getLogger(self.__class__.LogName)
    # Let every message through for now, so that warnings about deprecated
    # properties are printed before the configured level is applied.
    self.logger.setLevel(logging.DEBUG)

    # Warn about the deprecated log-level property.
    if self.ConfLogLevel_deprecated in self.properties:
        deprecation.deprecation_warning(self.logger,
                                        self.ConfLogLevel_deprecated,
                                        self.ConfLogLevel)
        if self.properties[self.ConfLogLevel] == self.DefaultLogLevel and \
           self.properties[self.ConfLogLevel_deprecated] != self.DefaultLogLevel:
            # The deprecated property differs from the default while the new
            # one does not, so the user set the deprecated one: use it.
            self.properties[self.ConfLogLevel] = self.properties[
                self.ConfLogLevel_deprecated]
            self.logger.warning(
                "Using value %s for property %s (value taken from its deprecated equivalent property %s).",
                self.properties[self.ConfLogLevel], self.ConfLogLevel,
                self.ConfLogLevel_deprecated)

    # Apply the configured logging level.
    level_name = self.properties['seal.seqal.log.level']
    log_level = getattr(logging, level_name, None)
    # getattr() finds *any* attribute of the logging module (functions,
    # classes, ...); only an integer is a usable level.
    if not isinstance(log_level, int):
        self.logger.setLevel(logging.DEBUG)
        # Arguments follow the format string: offending value first, then
        # the name of the property it was given for.
        self.logger.warning(
            "Invalid configuration value '%s' for %s. Check your configuration.",
            level_name, self.ConfLogLevel)
        self.logger.warning("Falling back to DEBUG")
        self.logger.warning(
            "Valid values for seal.seqal.log.level are: DEBUG, INFO, WARNING, ERROR, CRITICAL; default: %s",
            self.DefaultLogLevel)
    else:
        self.logger.setLevel(log_level)

    # Reference archive.
    self.properties[
        'mapred.cache.archives'] = '%s#reference' % self.options.reference

    # Number of reduce tasks.
    if self.options.align_only:
        # Align-only runs use no reducers; an explicit request is ignored.
        n_red_tasks = 0
        if self.options.num_reducers and self.options.num_reducers > 0:
            self.logger.warning(
                "Number of reduce tasks must be 0 when doing --align-only."
            )
            self.logger.warning("Ignoring request for %d reduce tasks",
                                self.options.num_reducers)
    elif self.options.num_reducers:
        n_red_tasks = self.options.num_reducers
    else:
        n_red_tasks = SeqalSubmit.DefaultReduceTasksPerNode * hadut.get_num_nodes(
        )

    self.properties['mapred.reduce.tasks'] = n_red_tasks