Beispiel #1
0
    def __add_files(self, dirs):
        opts = []
        # TODO: optimize? this is now O(dirs*entries*files).
        for dir in dirs:
            for entry in os.walk(dir):
                dir, dirnames, files = entry
                # for each file add it with correct option.
                for file in files:
                    if not os.path.isfile(dir + "/" + file):
                        msg = "error: File not found, %s." % file
                        fail(msg)

                    suffix = file.split(".")[-1]
                    option = None
                    if suffix == "egg": option = "libegg"
                    elif suffix == "jar": option = "libjar"
                    elif suffix == "py": option = "file"
                    elif suffix == "yaml": option = "file"

                    if option:
                        print 'import: adding %s to jobjar.' % file
                        opts.append((option, dir + "/" + file))
                    else:
                        print "import: ignoring " + dir + '/' + file
        return opts
Beispiel #2
0
    def __add_files(self,dirs):
        opts = []
        # TODO: optimize? this is now O(dirs*entries*files).
        for dir in dirs:
            for entry in os.walk(dir):
                dir,dirnames,files = entry
                # for each file add it with correct option.
                for file in files:
                    if not os.path.isfile(dir+"/"+file):
                        msg = "error: File not found, %s." % file
                        fail(msg)

                    suffix = file.split(".")[-1]
                    option = None
                    if   suffix == "egg":  option = "libegg"
                    elif suffix == "jar":  option = "libjar"
                    elif suffix == "py":   option = "file"
                    elif suffix == "yaml": option = "file"

                    if option:
                        print 'import: adding %s to jobjar.' % file
                        opts.append((option, dir+"/"+file))
                    else:
                        print "import: ignoring " + dir+'/'+file
        return opts
Beispiel #3
0
    def __init__(self, path):
        """Create a new zohmg project at *path* by copying the skeleton
        project directory, then record its basename and absolute path."""
        self.basename = os.path.basename(path)
        self.abspath  = os.path.abspath(path)

        try:
            shutil.copytree('/usr/local/share/zohmg/skel-project', self.abspath)
            # reset access and mod times.
            # NOTE(review): abspath is interpolated unquoted into a shell
            # command — misbehaves on paths containing spaces; confirm.
            os.system('cd %s; touch *; touch **/*' % self.abspath)
        except OSError, ose:
            # something went wrong. act accordingly.
            msg = "error: could not create project directory - %s" % ose.strerror
            fail(msg, ose.errno)
Beispiel #4
0
    def read_environ(self):
        """Import config/environment.py and copy every attribute of the
        module into self.environ.

        Calls fail() when the module cannot be imported.
        NOTE(review): dir() also lists dunder names (__name__,
        __builtins__, ...), so those end up in self.environ as well.
        """
        # add config path so we can import from it.
        sys.path.append(".")
        sys.path.append("config")

        try:
            import environment
        except ImportError:
            msg = "[%s] Error: Could not import environment.py" % time.asctime()
            fail(msg)

        for key in dir(environment):
            self.environ[key] = environment.__dict__[key]
Beispiel #5
0
    def read_environ(self):
        """Import config/environment.py and copy every attribute of the
        module into self.environ.

        Calls fail() when the module cannot be imported. Unlike the naive
        version, sys.path entries are appended only once, so repeated
        calls do not grow sys.path indefinitely.
        """
        # add config path so we can import from it (idempotently).
        for p in (".", "config"):
            if p not in sys.path:
                sys.path.append(p)

        try:
            import environment
        except ImportError:
            msg = "[%s] Error: Could not import environment.py" % time.asctime()
            fail(msg)

        # copy all module attributes (including dunders) into environ.
        for key in dir(environment):
            self.environ[key] = environment.__dict__[key]
Beispiel #6
0
    def go(self, mapper, input, for_dumbo):
        local_mode = False  # default: run jobs on Hadoop.
        local_output_path = '/tmp/zohmg-output'  # TODO: make user configurable.

        table = Config().dataset()
        jobname = "%s %s" % (table, input
                             )  # overrides any name specified on cli.

        resolver = 'fm.last.darling.hbase.HBaseIdentifierResolver'
        outputformat = 'org.apache.hadoop.hbase.mapreduce.TableOutputFormat'

        opts = [
            ('jobconf', "hbase.mapred.outputtable=" + table),
            ('jobconf', 'stream.io.identifier.resolver.class=' + resolver),
            ('streamoutput', 'hbase'),  # resolved by identifier.resolver
            ('outputformat', outputformat),
            ('input', input),
            ('file', 'lib/usermapper.py'),  # TODO: handle this more betterer.
            ('name', jobname)
        ]

        # add zohmg-*.egg
        zohmg_egg = [z for z in sys.path if "zohmg" in z][0]
        opts.append(('libegg', zohmg_egg))

        # add files to the jobjar from these paths
        jar_path = '/usr/local/lib/zohmg/jar'
        egg_path = '/usr/local/lib/zohmg/egg'
        directories = ["config", "lib", jar_path, egg_path]
        file_opts = self.__add_files(directories)
        opts.extend(file_opts)

        ## check extra arguments.
        # TODO: allow for any order of extra elements.
        #       as it stands, --local must be specified before --lzo.
        # first, check for '--local'
        if len(for_dumbo) > 0 and for_dumbo[0] == '--local':
            local_mode = True
            for_dumbo.pop(0)  # remove '--local'.
        # check for '--lzo' as first extra argument.
        if len(for_dumbo) > 0 and for_dumbo[0] == '--lzo':
            print 'lzo mode: enabled.'
            opts.append(
                ('inputformat', 'org.apache.hadoop.mapred.LzoTextInputFormat'))
            for_dumbo.pop(0)  # remove '--lzo'.

        env = Environ()

        if local_mode:
            print 'local mode: enabled.'
            opts.append(('output', local_output_path))
        else:
            print 'hadoop mode: enabled.'
            hadoop_home = env.get("HADOOP_HOME")
            if not os.path.isdir(hadoop_home):
                msg = "error: HADOOP_HOME in config/environment.py is not a directory."
                fail(msg)
            opts.append(('output', '/tmp/does-not-matter'))
            opts.append(('hadoop', hadoop_home))

        # add jars defined in config/environment.py to jobjar.
        classpath = env.get("CLASSPATH")
        if classpath is not None:
            for jar in classpath:
                if not os.path.isfile(jar):
                    msg = "error: jar defined in config/environment is not a file: %s." % jar
                    fail(msg)
                else:
                    print 'import: adding %s to jobjar.' % jar
                    opts.append(('libjar', jar))
        else:
            msg = "error: CLASSPATH in config/environment is empty."
            fail(msg)

        # stringify arguments.
        opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
        more_args = ' '.join(for_dumbo)  # TODO: is this necessary?
        dumboargs = "%s %s" % (opts_args, more_args)
        print "giving dumbo these args: " + dumboargs

        # link-magic for usermapper.
        usermapper = os.path.abspath(".") + "/lib/usermapper.py"
        if os.path.isfile(usermapper):
            # TODO: need to be *very* certain we're not unlinking the wrong file.
            os.unlink(usermapper)
        # TODO: SECURITY, need to be certain that we symlink correct file.
        # TODO: borks if lib directory does not exist.
        os.symlink(mapper, usermapper)

        # let the user know what will happen.
        if local_mode:
            print 'doing local run.'
            print 'data will not be imported to hbase.'
            print 'output is at ' + local_output_path

        # dispatch.
        # PYTHONPATH is added because dumbo makes a local run before
        # engaging with hadoop.
        os.system(
            "PYTHONPATH=lib dumbo start /usr/local/lib/zohmg/mapred/import.py "
            + dumboargs)
Beispiel #7
0
            # ok, good!
            f.close()
            config_loaded = True


        if not config_loaded:
            # condition A.
            sys.stderr.write("Configuration error: Could not read dataset configuration " \
                              "from any of these files:\n" \
                              "\n".join(possible_configs) + "\n")
            raise ConfigNotLoaded("Could not read configuration file.")

        # check contents.
        if not self.sanity_check():
            msg = "[%s] Configuration error: Could not parse configuration from %s." % (time.asctime(), file_loaded)
            fail(msg) # TODO: should maybe not use fail as it raises SystemExit.

        return self.config


    def dataset(self):
        """Return the dataset name from the loaded configuration."""
        return self.config['dataset']
    def dimensions(self):
        """Return the list of dimensions from the loaded configuration."""
        return self.config['dimensions']
    def units(self):
        """Return the list of units from the loaded configuration."""
        return self.config['units']
    def projections(self):
        """Return the configured projections as lists of dimension names.

        Each 'a-b-c' string from the config is split on '-':
        ['country', 'country-domain-useragent-usertype']
        => [['country'], ['country', 'domain', 'useragent', 'usertype']]
        """
        return [s.split('-') for s in self.config['projections']]
def refuse_to_act_in_nonzohmg_directory():
    """Call fail() unless the current working directory contains a
    .zohmg marker, i.e. looks like a zohmg project root."""
    cwd = os.getcwd()
    if not os.path.exists(cwd + "/.zohmg"):
        msg = "error: This is not a proper zohmg project."
        fail(msg)
Beispiel #9
0
    def go(self, mapper, input, for_dumbo):
        """Assemble dumbo arguments and kick off the import job.

        mapper    -- path to the user's mapper; symlinked to lib/usermapper.py.
        input     -- input path for the job.
        for_dumbo -- extra cli args. A leading '--local' and then '--lzo'
                     (this order only) are consumed here; the remainder
                     is handed to dumbo untouched.
        """
        local_mode = False # default: run jobs on Hadoop.
        local_output_path = '/tmp/zohmg-output' # TODO: make user configurable.

        table = Config().dataset()
        jobname = "%s %s" % (table, input) # overrides any name specified on cli.

        # streaming plumbing: identifier resolver + hbase output format.
        resolver = 'fm.last.darling.hbase.HBaseIdentifierResolver'
        outputformat = 'org.apache.hadoop.hbase.mapreduce.TableOutputFormat'

        opts = [('jobconf', "hbase.mapred.outputtable=" + table),
                ('jobconf', 'stream.io.identifier.resolver.class=' + resolver),
                ('streamoutput', 'hbase'), # resolved by identifier.resolver
                ('outputformat', outputformat),
                ('input', input),
                ('file', 'lib/usermapper.py'), # TODO: handle this more betterer.
                ('name', jobname)
               ]

        # add zohmg-*.egg
        # NOTE(review): takes the first sys.path entry containing "zohmg";
        # raises IndexError if none matches — confirm egg is always on path.
        zohmg_egg = [z for z in sys.path if "zohmg" in z][0]
        opts.append(('libegg', zohmg_egg))

        # add files to the jobjar from these paths
        jar_path = '/usr/local/lib/zohmg/jar'
        egg_path = '/usr/local/lib/zohmg/egg'
        directories = ["config", "lib", jar_path, egg_path]
        file_opts = self.__add_files(directories)
        opts.extend(file_opts)

        ## check extra arguments.
        # TODO: allow for any order of extra elements.
        #       as it stands, --local must be specified before --lzo.
        # first, check for '--local'
        if len(for_dumbo) > 0 and for_dumbo[0] == '--local':
            local_mode = True
            for_dumbo.pop(0) # remove '--local'.
        # check for '--lzo' as first extra argument.
        if len(for_dumbo) > 0 and for_dumbo[0] == '--lzo':
            print 'lzo mode: enabled.'
            opts.append(('inputformat', 'org.apache.hadoop.mapred.LzoTextInputFormat'))
            for_dumbo.pop(0) # remove '--lzo'.

        env = Environ()

        if local_mode:
            print 'local mode: enabled.'
            opts.append(('output', local_output_path))
        else:
            print 'hadoop mode: enabled.'
            hadoop_home = env.get("HADOOP_HOME")
            if not os.path.isdir(hadoop_home):
                msg = "error: HADOOP_HOME in config/environment.py is not a directory."
                fail(msg)
            # output path is unused by the hbase output format.
            opts.append(('output', '/tmp/does-not-matter'))
            opts.append(('hadoop', hadoop_home))

        # add jars defined in config/environment.py to jobjar.
        classpath = env.get("CLASSPATH")
        if classpath is not None:
            for jar in classpath:
                if not os.path.isfile(jar):
                    msg = "error: jar defined in config/environment is not a file: %s." % jar
                    fail(msg)
                else:
                    print 'import: adding %s to jobjar.' % jar
                    opts.append(('libjar', jar))
        else:
            msg = "error: CLASSPATH in config/environment is empty."
            fail(msg)

        # stringify arguments.
        opts_args = ' '.join("-%s '%s'" % (k, v) for (k, v) in opts)
        more_args = ' '.join(for_dumbo) # TODO: is this necessary?
        dumboargs = "%s %s" % (opts_args, more_args)
        print "giving dumbo these args: " + dumboargs

        # link-magic for usermapper.
        usermapper = os.path.abspath(".") + "/lib/usermapper.py"
        if os.path.isfile(usermapper):
            # TODO: need to be *very* certain we're not unlinking the wrong file.
            os.unlink(usermapper)
        # TODO: SECURITY, need to be certain that we symlink correct file.
        # TODO: borks if lib directory does not exist.
        os.symlink(mapper, usermapper)

        # let the user know what will happen.
        if local_mode:
            print 'doing local run.'
            print 'data will not be imported to hbase.'
            print 'output is at ' + local_output_path

        # dispatch.
        # PYTHONPATH is added because dumbo makes a local run before
        # engaging with hadoop.
        os.system("PYTHONPATH=lib dumbo start /usr/local/lib/zohmg/mapred/import.py " + dumboargs)
Beispiel #10
0
            # ok, good!
            f.close()
            config_loaded = True

        if not config_loaded:
            # condition A.
            sys.stderr.write("Configuration error: Could not read dataset configuration " \
                              "from any of these files:\n" \
                              "\n".join(possible_configs) + "\n")
            raise ConfigNotLoaded("Could not read configuration file.")

        # check contents.
        if not self.sanity_check():
            msg = "[%s] Configuration error: Could not parse configuration from %s." % (
                time.asctime(), file_loaded)
            fail(msg
                 )  # TODO: should maybe not use fail as it raises SystemExit.

        return self.config

    def dataset(self):
        """Return the dataset name from the loaded configuration."""
        return self.config['dataset']

    def dimensions(self):
        """Return the list of dimensions from the loaded configuration."""
        return self.config['dimensions']

    def units(self):
        """Return the list of units from the loaded configuration."""
        return self.config['units']

    def projections(self):
        # turn list of strings into list of list of strings.
        # ['country', 'country-domain-useragent-usertype']
Beispiel #11
0
def refuse_to_act_in_nonzohmg_directory():
    """Call fail() unless the current working directory contains a
    .zohmg marker, i.e. looks like a zohmg project root."""
    marker = os.getcwd() + "/.zohmg"
    if os.path.exists(marker):
        return
    fail("error: This is not a proper zohmg project.")