import os
import sys
import time

from zohmg.config import Config  # project-local config helper (assumed import path).
import data_utils                # project-local HBase/JSONP helpers (assumed import path).


class transform(object):
    def __init__(self):
        self.config = Config()
        self.table = self.config.dataset()
        self.projections = self.config.projections()

    def __call__(self, environ, start_response):
        project_dir = environ["zohmg_project_dir"]
        # trim the leading and trailing slash before splitting.
        url_parts = environ["PATH_INFO"][1:-1].split("/")
        print "[%s] Transform, serving from %s." % (time.asctime(), project_dir)

        if len(url_parts) > 1:
            start_response("404 Not Found", [("Content-type", "text/html")])
            return ["Too many levels in path: %s." % environ["PATH_INFO"]]
        else:
            # import the user's transformer module:
            # make <project_dir>/transformers importable, then load the named module.
            sys.path.append(os.path.join(project_dir, "transformers"))
            usertransformer = __import__(url_parts[0])
            transform = usertransformer.transform

            payload = data_utils.hbase_get(self.table, self.projections, environ)
            if payload:
                start_response("200 OK", [("Content-type", "text/html")])
                return [data_utils.dump_jsonp(transform(payload))]
            else:
                start_response("404 Not Found", [("Content-type", "text/html")])
                return ["Bad query or no data found."]
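# A minimal sketch of the user transformer module the WSGI app above imports:
# a file such as <project_dir>/transformers/identity.py exposing a callable
# named 'transform'. The module name 'identity' and the payload shape are
# hypothetical; only the 'transform' attribute is required by the code above.

def transform(payload):
    # pass the HBase payload through untouched; a real transformer would
    # typically reshape or filter it before it is serialized as JSONP.
    return payload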
def test_sanity_check(self):
    # a few broken configurations,
    for x in ['a', 'b', 'c']:
        dataset = 'tests/fixtures/dataset-broken-%s.yaml' % x
        self.assertRaises(SystemExit, Config, dataset)
    # and a good one.
    dataset = 'tests/fixtures/dataset-ok.yaml'
    c = Config(dataset)
    self.assertEqual(c.sanity_check(), True)
def __init__(self, dataset=None, projections=None):
    # fall back to the values from Config() for any argument not given.
    if dataset is None or projections is None:
        config = Config()
    if dataset is None:
        self.table = config.dataset()
    else:
        self.table = dataset
    if projections is None:
        self.projections = config.projections()
    else:
        self.projections = projections
def please(self):
    host = 'localhost'
    table = Config().dataset()

    # confirm.
    print "reset will *wipe all data* in dataset '%s'." % table
    print "ARE YOU QUITE SURE? ('yes' to confirm.)"
    try:
        response = sys.stdin.readline().strip()
        if response.lower() not in ["yes", "yes!"]:
            print 'phew!'
            sys.exit(0)
    except KeyboardInterrupt:
        print 'hyorgh!'
        sys.exit(0)

    # disable+drop.
    try:
        print "ok, wiping!"
        ZohmgHBase.delete_table(table)
        # recreate.
        print
        print "recreating."
        Setup().go()
    except Exception, e:
        print 'reset failed :-('
        print 'error: ' + str(e)
        sys.exit(1)
def go(self):
    dataset = Config().config['dataset']
    column_family = ["unit"]
    print "creating table '%s'" % dataset
    ZohmgHBase.create_table(dataset, column_family)
    print 'ok.'
class Combiner(object):
    def __init__(self):
        self.config = Config()

    def __call__(self, key, values):
        # currently only supports average and sum.
        timestamp, projection, dimensions, unit = key
        if self.config.aggregations()[unit] == 'average':
            total = 0.0
            num = 0
            for item in values:
                total += item
                num += 1
            value = total / num
        else:
            value = sum(values)
        yield key, value
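# A self-contained sketch of the combiner's two aggregation paths, with a
# stubbed aggregation table standing in for Config().aggregations() (the unit
# names here are hypothetical). Note that a combiner may run on partial groups;
# for 'average', the reducer then averages the combiner's partial averages,
# which is only exact when the partial groups are equally sized.

aggregations = {'scrobbles': 'sum', 'rating': 'average'}

def combine(unit, values):
    # mirrors the fold in Combiner.__call__ above.
    if aggregations[unit] == 'average':
        total, num = 0.0, 0
        for item in values:
            total += item
            num += 1
        return total / num
    return sum(values)

print combine('scrobbles', [3, 4, 5])  # -> 12
print combine('rating', [3, 4, 5])     # -> 4.0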
import json  # needed for the payload encoding below.


class Reducer(object):
    def __init__(self):
        self.config = Config()

    def __call__(self, key, values):
        timestamp, projection, dimensions, unit = key

        # currently only supports average and sum.
        if self.config.aggregations()[unit] == "average":
            total = 0.0
            num = 0
            for item in values:
                total += item
                num += 1
            value = total / num
            if value == 0:
                return
        else:
            value = sum(values)

        # encode dimensions and their attributes in the rowkey.
        # (it's important that we get the ordering right.)
        rowkeyarray = []
        for d in projection:
            rowkeyarray.append(d)
            rowkeyarray.append(dimensions[d])
        rowkeyarray.append(str(timestamp))
        rowkey = "-".join(rowkeyarray)
        # rowkey => 'artist-97930-track-102203-20090601'

        columnfamily = "unit:"
        cfq = columnfamily + unit
        # cfq => 'unit:scrobbles'

        json_payload = json.dumps({cfq: {"value": value}})
        # json_payload => '{"unit:scrobbles": {"value": 1338}}'

        yield rowkey, json_payload
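# A standalone rendering of the reducer's rowkey encoding, using hypothetical
# dimension values; it reproduces the 'artist-97930-track-102203-20090601'
# example from the comments above.

projection = ('artist', 'track')
dimensions = {'artist': '97930', 'track': '102203'}
timestamp = 20090601

rowkeyarray = []
for d in projection:
    rowkeyarray.append(d)              # dimension name.
    rowkeyarray.append(dimensions[d])  # dimension attribute.
rowkeyarray.append(str(timestamp))
print "-".join(rowkeyarray)  # -> artist-97930-track-102203-20090601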
def __init__(self):
    self.config = Config()
    self.table = self.config.dataset()
    self.projections = self.config.projections()
def __init__(self, usermapper, projections=None):
    self.usermapper = usermapper
    if projections is None:
        projections = Config().projections()
    self.projections = projections
def go(self, mapper, input, for_dumbo):
    local_mode = False  # default: run jobs on Hadoop.
    local_output_path = '/tmp/zohmg-output'  # TODO: make user configurable.
    table = Config().dataset()
    jobname = "%s %s" % (table, input)  # overrides any name specified on cli.

    resolver = 'fm.last.darling.hbase.HBaseIdentifierResolver'
    outputformat = 'org.apache.hadoop.hbase.mapreduce.TableOutputFormat'

    opts = [
        ('jobconf', "hbase.mapred.outputtable=" + table),
        ('jobconf', 'stream.io.identifier.resolver.class=' + resolver),
        ('streamoutput', 'hbase'),  # resolved by identifier.resolver.
        ('outputformat', outputformat),
        ('input', input),
        ('file', 'lib/usermapper.py'),  # TODO: handle this more betterer.
        ('name', jobname)
    ]

    # add zohmg-*.egg.
    zohmg_egg = [z for z in sys.path if "zohmg" in z][0]
    opts.append(('libegg', zohmg_egg))

    # add files to the jobjar from these paths.
    jar_path = '/usr/local/lib/zohmg/jar'
    egg_path = '/usr/local/lib/zohmg/egg'
    directories = ["config", "lib", jar_path, egg_path]
    file_opts = self.__add_files(directories)
    opts.extend(file_opts)

    ## check extra arguments.
    # TODO: allow for any order of extra elements.
    # as it stands, --local must be specified before --lzo.

    # first, check for '--local'.
    if len(for_dumbo) > 0 and for_dumbo[0] == '--local':
        local_mode = True
        for_dumbo.pop(0)  # remove '--local'.

    # check for '--lzo' as first extra argument.
    if len(for_dumbo) > 0 and for_dumbo[0] == '--lzo':
        print 'lzo mode: enabled.'
        opts.append(('inputformat', 'org.apache.hadoop.mapred.LzoTextInputFormat'))
        for_dumbo.pop(0)  # remove '--lzo'.

    env = Environ()
    if local_mode:
        print 'local mode: enabled.'
        opts.append(('output', local_output_path))
    else:
        print 'hadoop mode: enabled.'
        hadoop_home = env.get("HADOOP_HOME")
        if not os.path.isdir(hadoop_home):
            msg = "error: HADOOP_HOME in config/environment.py is not a directory."
            fail(msg)
        opts.append(('output', '/tmp/does-not-matter'))
        opts.append(('hadoop', hadoop_home))

        # add jars defined in config/environment.py to jobjar.
        classpath = env.get("CLASSPATH")
        if classpath is not None:
            for jar in classpath:
                if not os.path.isfile(jar):
                    msg = "error: jar defined in config/environment is not a file: %s." % jar
                    fail(msg)
                else:
                    print 'import: adding %s to jobjar.' % jar
                    opts.append(('libjar', jar))
        else:
            msg = "error: CLASSPATH in config/environment is empty."
            fail(msg)

    # stringify arguments.
    opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
    more_args = ' '.join(for_dumbo)  # TODO: is this necessary?
    dumboargs = "%s %s" % (opts_args, more_args)
    print "giving dumbo these args: " + dumboargs

    # link-magic for usermapper.
    usermapper = os.path.abspath(".") + "/lib/usermapper.py"
    if os.path.isfile(usermapper):
        # TODO: need to be *very* certain we're not unlinking the wrong file.
        os.unlink(usermapper)
    # TODO: SECURITY, need to be certain that we symlink correct file.
    # TODO: borks if lib directory does not exist.
    os.symlink(mapper, usermapper)

    # let the user know what will happen.
    if local_mode:
        print 'doing local run.'
        print 'data will not be imported to hbase.'
        print 'output is at ' + local_output_path

    # dispatch.
    # PYTHONPATH is added because dumbo makes a local run before
    # engaging with hadoop.
    os.system("PYTHONPATH=lib dumbo start /usr/local/lib/zohmg/mapred/import.py " + dumboargs)
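# Hypothetical invocation, inferred from the argument handling above; the
# 'zohmg import' entry point is an assumption, but 'mapper' and 'input' map to
# this method's parameters, and any trailing flags arrive via 'for_dumbo'
# (with '--local' required before '--lzo'):
#
#   zohmg import lib/mapper.py data/scrobbles.log --local --lzo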
def __init__(self):
    self.config = Config()