Code Example #1
class transform(object):
    def __init__(self):
        self.config = Config()
        self.table = self.config.dataset()
        self.projections = self.config.projections()

    def __call__(self, environ, start_response):
        project_dir = environ["zohmg_project_dir"]
        url_parts = environ["PATH_INFO"][1:-1].split("/")  # trim pre- and appending /

        print "[%s] Transform, serving from %s." % (time.asctime(),
                                                    project_dir)

        if len(url_parts) > 1:
            start_response("404 Not Found", [("Content-type", "text/html")])
            return "Too many levels in path: %s." % environ["PATH_INFO"]
        else:
            # import user transformer.
            sys.path.append(project_dir)  # add cwd so we can import from there.
            usertransformer = __import__("transformers/" + url_parts[0])
            transform = usertransformer.transform

        payload = data_utils.hbase_get(self.table, self.projections, environ)
        if payload:
            start_response("200 OK", [("Content-type", "text/html")])
            return data_utils.dump_jsonp(transform(payload))
        else:
            start_response("404 Not Found", [("Content-type", "text/html")])
            return "Bad query or no data found."
Code Example #2
File: transform.py, Project: avtobiff/zohmg
class transform(object):
    def __init__(self):
        self.config = Config()
        self.table = self.config.dataset()
        self.projections = self.config.projections()

    def __call__(self,environ,start_response):
        project_dir = environ["zohmg_project_dir"]
        url_parts = environ["PATH_INFO"][1:-1].split("/") # trim pre- and appending /

        print "[%s] Transform, serving from %s." % (time.asctime(),project_dir)

        if len(url_parts) > 1:
            start_response("404 Not Found",[("Content-type","text/html")])
            return "Too many levels in path: %s." % environ["PATH_INFO"]
        else:
            # import user transformer.
            sys.path.append(project_dir) # add cwd so we can import from there.
            usertransformer = __import__("transformers/"+url_parts[0])
            transform = usertransformer.transform

        payload = data_utils.hbase_get(self.table,self.projections,environ)
        if payload:
            start_response("200 OK",[("Content-type","text/html")])
            return data_utils.dump_jsonp(transform(payload))
        else:
            start_response("404 Not Found",[("Content-type","text/html")])
            return "Bad query or no data found."
Code Example #3
File: test_config.py, Project: avtobiff/zohmg
 def test_sanity_check(self):
     # a few broken configurations,
     for x in ['a','b', 'c']:
         dataset = 'tests/fixtures/dataset-broken-%s.yaml' % x
         self.assertRaises(SystemExit, Config, dataset)
     # and a good one.
     dataset = 'tests/fixtures/dataset-ok.yaml'
     c = Config(dataset)
     self.assertEqual(c.sanity_check(), True)
Code Example #4
 def test_sanity_check(self):
     # a few broken configurations,
     for x in ['a', 'b', 'c']:
         dataset = 'tests/fixtures/dataset-broken-%s.yaml' % x
         self.assertRaises(SystemExit, Config, dataset)
     # and a good one.
     dataset = 'tests/fixtures/dataset-ok.yaml'
     c = Config(dataset)
     self.assertEqual(c.sanity_check(), True)
Code Example #5
    def __init__(self, dataset=None, projections=None):
        if dataset == None or projections == None:
            config = Config()

        if dataset == None:
            self.table = config.dataset()
        else:
            self.table = dataset

        if projections == None:
            self.projections = config.projections()
        else:
            self.projections = projections
Code Example #6
File: reset.py, Project: charles-cai/zohmg
    def please(self):
        host = 'localhost'
        table = Config().dataset()

        # confirm.
        print "reset will *wipe all data* in dataset '%s'." % table
        print "ARE YOU QUITE SURE? ('yes' to confirm.)"

        try:
            response = sys.stdin.readline().strip()
            if response.lower() not in ["yes", "yes!"]:
                print 'phew!'
                sys.exit(0)
        except KeyboardInterrupt:
            print 'hyorgh!'
            sys.exit(0)

        # disable+drop.
        try:
            print "ok, wiping!"
            ZohmgHBase.delete_table(table)
            # recreate.
            print
            print "recreating."
            Setup().go()
        except Exception, e:
            print 'reset failed :-('
            print 'error: ' + str(e)
            sys.exit(1)
Code Example #7
    def go(self):
        dataset = Config().config['dataset']
        column_family = ["unit"]

        print "creating table '%s'" % dataset
        ZohmgHBase.create_table(dataset, column_family)
        print 'ok.'
Code Example #8
File: combiner.py, Project: mbulat/zohmg
class Combiner(object):
    def __init__(self):
        self.config = Config()

    def __call__(self, key, values):
        # currently only supports average and sum
        timestamp, projection, dimensions, unit = key
        if self.config.aggregations()[unit] == 'average':
            total = 0.0
            num = 0
            for item in values:
                total += item
                num += 1
            value = total / num
        else:
            value = sum(values)

        yield key, value
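The branch above boils down to: average the values when the unit's configured aggregation is 'average', otherwise sum them. A standalone sketch of that logic on made-up values (no Config involved; the numbers are illustrative):

def aggregate(values, how):
    # mirrors the combiner body: running mean for 'average', plain sum otherwise.
    if how == 'average':
        total, num = 0.0, 0
        for item in values:
            total += item
            num += 1
        return total / num
    return sum(values)

print(aggregate([3, 4, 5], 'average'))  # 4.0
print(aggregate([3, 4, 5], 'sum'))      # 12

One caveat with this scheme: since the reducer averages its inputs again, the combiner's partial averages only reproduce the true mean when each combiner call sees the same number of values; sums are unaffected.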
Code Example #9
File: reducer.py, Project: tidewinds/zohmg
class Reducer(object):
    def __init__(self):
        self.config = Config()

    def __call__(self, key, values):
        timestamp, projection, dimensions, unit = key
        # currently only supports average and sum
        if self.config.aggregations()[unit] == "average":
            total = 0.0
            num = 0
            for item in values:
                total += item
                num += 1
            value = total / num

            if value == 0:
                return
        else:
            value = sum(values)

        # encode dimensions and their attributes in the rowkey.
        # (it's important that we get the ordering right.)
        rowkeyarray = []
        for d in projection:
            rowkeyarray.append(d)
            rowkeyarray.append(dimensions[d])
        rowkeyarray.append(str(timestamp))
        rowkey = "-".join(rowkeyarray)
        # rowkey => 'artist-97930-track-102203-20090601'

        columnfamily = "unit:"
        cfq = columnfamily + unit
        # cfq => 'unit:scrobbles'

        json_payload = json.dumps({cfq: {"value": value}})
        # json_payload => '{"unit:scrobbles": {"value": 1338}}'

        yield rowkey, json_payload
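The inline comments already spell out the target shapes. A minimal sketch that reproduces the rowkey and JSON payload for the example values used in those comments:

import json

projection = ("artist", "track")
dimensions = {"artist": "97930", "track": "102203"}
timestamp = 20090601

# rowkey: dimension name/value pairs in projection order, then the timestamp.
rowkeyarray = []
for d in projection:
    rowkeyarray.append(d)
    rowkeyarray.append(dimensions[d])
rowkeyarray.append(str(timestamp))
print("-".join(rowkeyarray))  # artist-97930-track-102203-20090601

# column family plus qualifier, and the JSON payload stored under it.
print(json.dumps({"unit:scrobbles": {"value": 1338}}))  # {"unit:scrobbles": {"value": 1338}}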
Code Example #10
 def __init__(self):
     self.config = Config()
     self.table = self.config.dataset()
     self.projections = self.config.projections()
Code Example #11
File: transform.py, Project: avtobiff/zohmg
 def __init__(self):
     self.config = Config()
     self.table = self.config.dataset()
     self.projections = self.config.projections()
Code Example #12
File: mapper.py, Project: charles-cai/zohmg
 def __init__(self, usermapper, projections=None):
     self.usermapper = usermapper
     if projections == None:
         projections = Config().projections()
     self.projections = projections
Code Example #13
    def go(self, mapper, input, for_dumbo):
        local_mode = False  # default: run jobs on Hadoop.
        local_output_path = '/tmp/zohmg-output'  # TODO: make user configurable.

        table = Config().dataset()
        jobname = "%s %s" % (table, input)  # overrides any name specified on cli.

        resolver = 'fm.last.darling.hbase.HBaseIdentifierResolver'
        outputformat = 'org.apache.hadoop.hbase.mapreduce.TableOutputFormat'

        opts = [
            ('jobconf', "hbase.mapred.outputtable=" + table),
            ('jobconf', 'stream.io.identifier.resolver.class=' + resolver),
            ('streamoutput', 'hbase'),  # resolved by identifier.resolver
            ('outputformat', outputformat),
            ('input', input),
            ('file', 'lib/usermapper.py'),  # TODO: handle this more betterer.
            ('name', jobname)
        ]

        # add zohmg-*.egg
        zohmg_egg = [z for z in sys.path if "zohmg" in z][0]
        opts.append(('libegg', zohmg_egg))

        # add files to the jobjar from these paths
        jar_path = '/usr/local/lib/zohmg/jar'
        egg_path = '/usr/local/lib/zohmg/egg'
        directories = ["config", "lib", jar_path, egg_path]
        file_opts = self.__add_files(directories)
        opts.extend(file_opts)

        ## check extra arguments.
        # TODO: allow for any order of extra elements.
        #       as it stands, --local must be specified before --lzo.
        # first, check for '--local'
        if len(for_dumbo) > 0 and for_dumbo[0] == '--local':
            local_mode = True
            for_dumbo.pop(0)  # remove '--local'.
        # check for '--lzo' as first extra argument.
        if len(for_dumbo) > 0 and for_dumbo[0] == '--lzo':
            print 'lzo mode: enabled.'
            opts.append(
                ('inputformat', 'org.apache.hadoop.mapred.LzoTextInputFormat'))
            for_dumbo.pop(0)  # remove '--lzo'.

        env = Environ()

        if local_mode:
            print 'local mode: enabled.'
            opts.append(('output', local_output_path))
        else:
            print 'hadoop mode: enabled.'
            hadoop_home = env.get("HADOOP_HOME")
            if not os.path.isdir(hadoop_home):
                msg = "error: HADOOP_HOME in config/environment.py is not a directory."
                fail(msg)
            opts.append(('output', '/tmp/does-not-matter'))
            opts.append(('hadoop', hadoop_home))

        # add jars defined in config/environment.py to jobjar.
        classpath = env.get("CLASSPATH")
        if classpath is not None:
            for jar in classpath:
                if not os.path.isfile(jar):
                    msg = "error: jar defined in config/environment is not a file: %s." % jar
                    fail(msg)
                else:
                    print 'import: adding %s to jobjar.' % jar
                    opts.append(('libjar', jar))
        else:
            msg = "error: CLASSPATH in config/environment is empty."
            fail(msg)

        # stringify arguments.
        opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
        more_args = ' '.join(for_dumbo)  # TODO: is this necessary?
        dumboargs = "%s %s" % (opts_args, more_args)
        print "giving dumbo these args: " + dumboargs

        # link-magic for usermapper.
        usermapper = os.path.abspath(".") + "/lib/usermapper.py"
        if os.path.isfile(usermapper):
            # TODO: need to be *very* certain we're not unlinking the wrong file.
            os.unlink(usermapper)
        # TODO: SECURITY, need to be certain that we symlink correct file.
        # TODO: borks if lib directory does not exist.
        os.symlink(mapper, usermapper)

        # let the user know what will happen.
        if local_mode:
            print 'doing local run.'
            print 'data will not be imported to hbase.'
            print 'output is at ' + local_output_path

        # dispatch.
        # PYTHONPATH is added because dumbo makes a local run before
        # engaging with hadoop.
        os.system(
            "PYTHONPATH=lib dumbo start /usr/local/lib/zohmg/mapred/import.py "
            + dumboargs)
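The command line handed to dumbo is just every (option, value) pair rendered as -option 'value' and joined with spaces. A small sketch of that stringification step using a couple of the options built above (the table name and input path are made up):

opts = [('jobconf', 'hbase.mapred.outputtable=mydataset'),
        ('input', '/data/pageviews'),
        ('name', 'mydataset /data/pageviews')]

opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
print(opts_args)
# -jobconf 'hbase.mapred.outputtable=mydataset' -input '/data/pageviews' -name 'mydataset /data/pageviews'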
Code Example #14
File: reducer.py, Project: tidewinds/zohmg
 def __init__(self):
     self.config = Config()
Code Example #15
File: reducer.py, Project: charles-cai/zohmg
 def __init__(self):
     self.config = Config()