def __add_files(self, dirs):
    opts = []
    # TODO: optimize? this is now O(dirs*entries*files).
    for dir in dirs:
        for entry in os.walk(dir):
            dir, dirnames, files = entry
            # for each file add it with correct option.
            for file in files:
                if not os.path.isfile(dir + "/" + file):
                    msg = "error: File not found, %s." % file
                    fail(msg)
                suffix = file.split(".")[-1]
                option = None
                if suffix == "egg":
                    option = "libegg"
                elif suffix == "jar":
                    option = "libjar"
                elif suffix == "py":
                    option = "file"
                elif suffix == "yaml":
                    option = "file"
                if option:
                    print 'import: adding %s to jobjar.' % file
                    opts.append((option, dir + "/" + file))
                else:
                    print "import: ignoring " + dir + '/' + file
    return opts
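# Illustration only -- the directory and file names below are hypothetical, not part
# of the project. For a directory "lib" containing foo.egg, bar.jar, mapper.py and
# notes.txt, self.__add_files(["lib"]) would return something like
#   [('libegg', 'lib/foo.egg'),
#    ('libjar', 'lib/bar.jar'),
#    ('file',   'lib/mapper.py')]
# and print an "import: ignoring lib/notes.txt" line for the unrecognized suffix.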
def __init__(self, path):
    self.basename = os.path.basename(path)
    self.abspath = os.path.abspath(path)
    try:
        shutil.copytree('/usr/local/share/zohmg/skel-project', self.abspath)
        # reset access and mod times.
        os.system('cd %s; touch *; touch **/*' % self.abspath)
    except OSError, ose:
        # something went wrong. act accordingly.
        msg = "error: could not create project directory - %s" % ose.strerror
        fail(msg, ose.errno)
def read_environ(self):
    # add config path so we can import from it.
    sys.path.append(".")
    sys.path.append("config")
    try:
        import environment
    except ImportError:
        msg = "[%s] Error: Could not import environment.py" % time.asctime()
        fail(msg)
    for key in dir(environment):
        self.environ[key] = environment.__dict__[key]
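# Sketch only: config/environment.py is a plain Python module whose top-level names
# end up as keys in self.environ. Based on the keys read elsewhere in this code
# (HADOOP_HOME and CLASSPATH), it might look like the following; the values are
# placeholders, not defaults shipped with zohmg.
#
#   HADOOP_HOME = "/usr/lib/hadoop"            # must be an existing directory.
#   CLASSPATH = ["/usr/lib/hbase/hbase.jar"]   # jars to bundle into the jobjar.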
def go(self, mapper, input, for_dumbo):
    local_mode = False  # default: run jobs on Hadoop.
    local_output_path = '/tmp/zohmg-output'  # TODO: make user configurable.
    table = Config().dataset()
    jobname = "%s %s" % (table, input)  # overrides any name specified on cli.

    resolver = 'fm.last.darling.hbase.HBaseIdentifierResolver'
    outputformat = 'org.apache.hadoop.hbase.mapreduce.TableOutputFormat'

    opts = [('jobconf', "hbase.mapred.outputtable=" + table),
            ('jobconf', 'stream.io.identifier.resolver.class=' + resolver),
            ('streamoutput', 'hbase'),  # resolved by identifier.resolver
            ('outputformat', outputformat),
            ('input', input),
            ('file', 'lib/usermapper.py'),  # TODO: handle this more betterer.
            ('name', jobname)]

    # add zohmg-*.egg
    zohmg_egg = [z for z in sys.path if "zohmg" in z][0]
    opts.append(('libegg', zohmg_egg))

    # add files to the jobjar from these paths
    jar_path = '/usr/local/lib/zohmg/jar'
    egg_path = '/usr/local/lib/zohmg/egg'
    directories = ["config", "lib", jar_path, egg_path]
    file_opts = self.__add_files(directories)
    opts.extend(file_opts)

    ## check extra arguments.
    # TODO: allow for any order of extra elements.
    #       as it stands, --local must be specified before --lzo.

    # first, check for '--local'
    if len(for_dumbo) > 0 and for_dumbo[0] == '--local':
        local_mode = True
        for_dumbo.pop(0)  # remove '--local'.

    # check for '--lzo' as first extra argument.
    if len(for_dumbo) > 0 and for_dumbo[0] == '--lzo':
        print 'lzo mode: enabled.'
        opts.append(('inputformat', 'org.apache.hadoop.mapred.LzoTextInputFormat'))
        for_dumbo.pop(0)  # remove '--lzo'.

    env = Environ()

    if local_mode:
        print 'local mode: enabled.'
        opts.append(('output', local_output_path))
    else:
        print 'hadoop mode: enabled.'
        hadoop_home = env.get("HADOOP_HOME")
        if not os.path.isdir(hadoop_home):
            msg = "error: HADOOP_HOME in config/environment.py is not a directory."
            fail(msg)
        opts.append(('output', '/tmp/does-not-matter'))
        opts.append(('hadoop', hadoop_home))

        # add jars defined in config/environment.py to jobjar.
        classpath = env.get("CLASSPATH")
        if classpath is not None:
            for jar in classpath:
                if not os.path.isfile(jar):
                    msg = "error: jar defined in config/environment is not a file: %s." % jar
                    fail(msg)
                else:
                    print 'import: adding %s to jobjar.' % jar
                    opts.append(('libjar', jar))
        else:
            msg = "error: CLASSPATH in config/environment is empty."
            fail(msg)

    # stringify arguments.
    opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
    more_args = ' '.join(for_dumbo)  # TODO: is this necessary?
    dumboargs = "%s %s" % (opts_args, more_args)
    print "giving dumbo these args: " + dumboargs

    # link-magic for usermapper.
    usermapper = os.path.abspath(".") + "/lib/usermapper.py"
    if os.path.isfile(usermapper):
        # TODO: need to be *very* certain we're not unlinking the wrong file.
        os.unlink(usermapper)
    # TODO: SECURITY, need to be certain that we symlink correct file.
    # TODO: borks if lib directory does not exist.
    os.symlink(mapper, usermapper)

    # let the user know what will happen.
    if local_mode:
        print 'doing local run.'
        print 'data will not be imported to hbase.'
        print 'output is at ' + local_output_path

    # dispatch.
    # PYTHONPATH is added because dumbo makes a local run before
    # engaging with hadoop.
    os.system("PYTHONPATH=lib dumbo start /usr/local/lib/zohmg/mapred/import.py " + dumboargs)
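# Illustration only (the table name "mytable" is made up): each (key, value) pair in
# opts is rendered as -key 'value', so dumboargs becomes one long option string such as
#   -jobconf 'hbase.mapred.outputtable=mytable' -streamoutput 'hbase' ...
# which is appended verbatim to the "dumbo start" command dispatched above.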
        # ok, good!
        f.close()
        config_loaded = True

    if not config_loaded:
        # condition A.
        sys.stderr.write("Configuration error: Could not read dataset configuration "
                         "from any of these files:\n" +
                         "\n".join(possible_configs) + "\n")
        raise ConfigNotLoaded("Could not read configuration file.")

    # check contents.
    if not self.sanity_check():
        msg = "[%s] Configuration error: Could not parse configuration from %s." % (time.asctime(), file_loaded)
        fail(msg)  # TODO: should maybe not use fail as it raises SystemExit.

    return self.config

def dataset(self):
    return self.config['dataset']

def dimensions(self):
    return self.config['dimensions']

def units(self):
    return self.config['units']

def projections(self):
    # turn list of strings into list of list of strings.
    # ['country', 'country-domain-useragent-usertype']
    #  => [['country'], ['country', 'domain', 'useragent', 'usertype']]
    return map(lambda s: s.split('-'), self.config['projections'])
def refuse_to_act_in_nonzohmg_directory():
    cwd = os.getcwd()
    if not os.path.exists(cwd + "/.zohmg"):
        msg = "error: This is not a proper zohmg project."
        fail(msg)