Example #1
    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote",
                            command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])
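
A note on the command="%s" above: it is a placeholder, not a runnable command. As Example #7's call_hadoop shows, the placeholder is filled in later, once the full hadoop argument list is known. A minimal sketch of that substitution, where argList is an assumed illustrative value and job stands for the Subshell created above:

# Sketch only: argList is a made-up example; the real list is built by call_hadoop.
argList = ['hadoop', 'jar', 'streaming.jar', '-input', 'in', '-output', 'out']
job.command = job.command % " ".join(argList)  # "%s" becomes the full command line
job.run()
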
Example #2
    def job_start(self, params):
        work_dir = params['working_directory']
        self.machine = self.get_machine()
        use_machine(self.machine)
        self.job = Subshell("remote", params['command'], work_dir)
        self.job.run()
        ret = self.job._ret
        if ret:
            try:
                job_id = int(ret.split('\n')[0])
            except ValueError:
                end_machine()
                raise ModuleError(self, "Error submitting job: %s" % ret)
        self.set_job_machine(params, self.machine)
        return params
Example #3
    def job_start(self, params):
        work_dir = params['working_directory']
        self.machine = self.get_machine()
        use_machine(self.machine)
        self.job = Subshell("remote", params['command'], work_dir)
        self.job.run()
        ret = self.job._ret
        if ret:
            try:
                job_id = int(ret.split('\n')[0])
            except ValueError:
                end_machine()
                raise ModuleError(self, "Error submitting job: %s" % ret)
        self.set_job_machine(params, self.machine)
        return params
Example #4
    def call_hadoop(self, arguments, workdir, identifier, machine):
        self.is_cacheable = lambda *args, **kwargs: False
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')

        # 1. this version returns when finished
        #return subprocess.call(argList)
        # 2. this version reads the results incrementally
#         expect = machine.remote._expect_token
#         machine.remote.push_expect(None) # Do not wait for call to finish
#         result =  machine.remote.send_command(" ".join(argList)).strip()
#         machine.remote.pop_expect() # restore expect
#         # We could show the output in a gui
#         print "**** hadoop streaming running ****"
#         print result,
#         while not expect in result:
#             output = machine.remote.consume_output()
#             if output:
#                 print output,
#             result += output
        # 3. The final version should detach the process on the server
        use_machine(machine)
        cdir = CreateDirectory("remote", workdir)
        job = Subshell("remote",
                       command=" ".join(argList),
                       working_directory=workdir,
                       identifier=identifier,
                       dependencies=[cdir])
        job.run()
        finished = job.finished()
        if not finished:
            status = job.status()
            # The Subshell class provides the JobHandle interface, i.e.
            # finished()
            raise ModuleSuspended(self, '%s' % status, handle=job)
        self.is_cacheable = lambda *args, **kwargs: True
        return job.standard_error()
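
If the job is still running, the method raises ModuleSuspended with the Subshell as its handle, so whoever catches the exception can check on the job later. A minimal polling sketch, assuming only the finished(), status(), and standard_error() calls used above (the helper function and the 30-second interval are illustrative, not part of the package):

import time

def wait_for_handle(handle, interval=30):
    # Poll a suspended job's handle until the remote process completes,
    # then return its captured output, mirroring call_hadoop's return value.
    while not handle.finished():
        print "job status: %s" % handle.status()
        time.sleep(interval)
    return handle.standard_error()

In practice the framework decides when a suspended module is re-checked; the loop above only shows what the handle exposes.
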
Example #5
    def call_hadoop(self, arguments, workdir, identifier, machine):
        self.is_cacheable = lambda *args, **kwargs: False
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')

        # 1. this version returns when finished
        #return subprocess.call(argList)
        # 2. this version reads the results incrementally
#         expect = machine.remote._expect_token
#         machine.remote.push_expect(None) # Do not wait for call to finish
#         result =  machine.remote.send_command(" ".join(argList)).strip()
#         machine.remote.pop_expect() # restore expect
#         # We could show the output in a gui
#         print "**** hadoop streaming running ****"
#         print result,
#         while not expect in result:
#             output = machine.remote.consume_output()
#             if output:
#                 print output,
#             result += output
        # 3. The final version should detach the process on the server
        use_machine(machine)
        cdir = CreateDirectory("remote", workdir)
        job = Subshell("remote", command=" ".join(argList),
                       working_directory=workdir, identifier=identifier,
                       dependencies=[cdir])
        job.run()
        finished = job.finished()
        if not finished:
            status = job.status()
            # The Subshell class provides the BaseMonitor interface, i.e.
            # finished()
            raise ModuleSuspended(self, '%s' % status, monitor=job)
        self.is_cacheable = lambda *args, **kwargs: True
        return job.standard_error()
Example #6
class RunJob(RQModule):
    """ Run an asynchronous command that can be detached and polled.
        This is preferable to RunCommand for long-running operations.
    """

    _input_ports = [('machine', Machine),
                    ('command', '(edu.utah.sci.vistrails.basic:String)', True),
                    ('working_directory', '(edu.utah.sci.vistrails.basic:String)'),
                   ]

    _output_ports = [('stdout', '(edu.utah.sci.vistrails.basic:String)'),
                     ('stderr', '(edu.utah.sci.vistrails.basic:String)'),
                    ]

    job = None
    def job_read_inputs(self):
        d = {}
        if not self.has_input('command'):
            raise ModuleError(self, "No command specified")
        d['command'] = self.get_input('command').strip()
        d['working_directory'] = self.get_input('working_directory') \
              if self.has_input('working_directory') else '.'
        return d

    def job_start(self, params):
        work_dir = params['working_directory']
        self.machine = self.get_machine()
        use_machine(self.machine)
        self.job = Subshell("remote", params['command'], work_dir)
        self.job.run()
        ret = self.job._ret
        if ret:
            try:
                job_id = int(ret.split('\n')[0])
            except ValueError:
                end_machine()
                raise ModuleError(self, "Error submitting job: %s" % ret)
        self.set_job_machine(params, self.machine)
        return params

    def job_get_handle(self, params):
        if not self.job:
            self.job_start(params)
        return self.job

    def job_finish(self, params):
        params['stdout'] = self.job.standard_output()
        params['stderr'] = self.job.standard_error()
        if self.job.failed():
            self.job._pushw()
            code = self.job.terminal.cat("%s.failed" %
                                         self.job._identifier_filename)
            self.job._popw()
            end_machine()
            raise ModuleError(self,
                              "Command failed with exit code %s: %s" %
                               (code.strip(), params['stderr'].strip()))
        end_machine()
        return params

    def job_set_results(self, params):
        self.set_output('stdout', params['stdout'])
        self.set_output('stderr', params['stderr'])
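
The job_* methods above form a lifecycle: read and validate the inputs, submit the command, hand back the Subshell as the job handle, and collect stdout/stderr once the handle reports completion. A minimal driver sketch, assuming mod is a RunJob instance and using only the methods defined above (the driver itself is hypothetical; it is not the framework's scheduler):

import time

def run_job_module(mod, poll=30):
    # Hypothetical driver that walks RunJob's job_* lifecycle end to end.
    params = mod.job_read_inputs()        # validate ports -> params dict
    params = mod.job_start(params)        # submit the Subshell on the machine
    handle = mod.job_get_handle(params)   # the Subshell doubles as the handle
    while not handle.finished():          # sketch blocks; a real driver would suspend and retry
        time.sleep(poll)
    params = mod.job_finish(params)       # gather stdout/stderr, end_machine()
    mod.job_set_results(params)           # publish the stdout/stderr output ports
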
Example #7
class HadoopStreaming(HadoopBaseModule):
    """
    The class for executing MapReduce using Hadoop Streaming with
    customized Python Mapper/Reducer/Combiner
    
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Mapper',       File),
                    IPort('Reducer',      File),
                    IPort('Combiner',     File),
                    IPort('Workdir',      String),
                    IPort('Identifier',   String),
                    IPort('Input',        String),
                    IPort('Output',       String),
                    IPort('CacheFile',    String),
                    IPort('CacheArchive', String),
                    IPort('Environment',  String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]

    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Output', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)
        self.job = None
        self.job_machine = None

    def job_read_inputs(self):
        p = {}
        self.localMapper = self.force_get_input('Mapper')
        self.localReducer = self.force_get_input('Reducer')
        self.localCombiner = self.force_get_input('Combiner')
        p['workdir'] = self.force_get_input('Workdir')
        if p['workdir'] is None:
            p['workdir'] = ".vistrails-hadoop"
        p['job_identifier'] = self.force_get_input('Identifier')
        if p['job_identifier'] is None:
            raise ModuleError(self, 'Job Identifier is required')
        p['input'] = self.force_get_input('Input')
        p['output'] = self.force_get_input('Output')
        if p['input'] is None or p['output'] is None:
            raise ModuleError(self, 'Input and Output are required')
        p['files'] = self.force_get_input_list('CacheFile')
        p['cacheArchives'] = self.force_get_input_list('CacheArchive')
        p['envVars'] = self.force_get_input_list('Environment') 
        return p

    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])

    def job_start(self, p):
        self.createJob(p)
        if not self.job_machine.remote.isdir(p['workdir']):
            self.job_machine.remote.mkdir(p['workdir'])
        self.set_job_machine(p, self.job_machine)
        self.job.reset()

        # Now generate the command line
        config = self.get_hadoop_config(self.job_machine)
        command = 'jar %s' % config['streaming.jar']
        generics = ''
        arguments = ''

        if '://' not in p['input']:
            p['input'] = self.add_prefix(p['input'], self.job_machine)
        if '://' not in p['output']:
            p['output'] = self.add_prefix(p['output'], self.job_machine)
        arguments += ' -input %s -output %s' % (p['input'], p['output'])
        
        if self.localMapper is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localMapper.name, tempfile)
            mapperFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -mapper %s' % mapperFileName
        else:
            arguments += ' -mapper org.apache.hadoop.mapred.lib.IdentityMapper'

        if self.localCombiner is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localCombiner.name,
                                               tempfile)
            combinerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -combiner %s' % combinerFileName

        if self.localReducer is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localReducer.name,
                                               tempfile)
            reducerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -reducer %s' % reducerFileName
        else:
            arguments += ' -numReduceTasks 0'

        for var in p['envVars']:
            arguments += ' -cmdenv ' + var

        for cacheArchive in p['cacheArchives']:
            arguments += ' -cacheArchive %s' % cacheArchive

        #from init import configuration
        #if configuration.check('uris') and configuration.uris:
        #    for uri in configuration.uris.split(';'):
        #        p['files'].append(uri)
        # -files is a generic Hadoop option and must precede the
        # streaming-specific options
        if p['files']:
            generics += ' -files ' + ','.join(p['files'])

        arguments = command + generics + arguments
        result = self.call_hadoop(arguments, p['workdir'],
                                  p['job_identifier'], self.job_machine)
        return p

    def job_get_handle(self, p):
        if not self.job:
            self.createJob(p)
        return self.job

    def job_finish(self, p):
        r = {}
        r['output'] = p['output']
        r['workdir'] = p['workdir']
        r['job_identifier'] = p['job_identifier']
        
        self.annotate({'hadoop_log':self.job.standard_error()})
        if self.job.failed():
            error = self.job.standard_error()
            raise ModuleError(self, error)
        return r

    def job_set_results(self, p):
        self.set_output('Output', p['output'])
        self.set_output('Machine', self.job_machine)

    def call_hadoop(self, arguments, workdir, identifier, machine):
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')
        self.annotate({'hadoop_command':" ".join(argList)})
        self.job.command = self.job.command % " ".join(argList)
        self.job.run()
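
For reference, the string that job_start assembles and call_hadoop splices into the Subshell's "%s" command has the shape sketched below. The paths and HDFS URIs are assumptions made up for illustration; only the layout (the jar command, then the generic -files option, then the streaming options) comes from the code above:

# Assumed illustration of the assembled argument string (paths/URIs are made up).
command = 'jar /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming.jar'
generics = ' -files /tmp/tmp.ab12cd'          # mapper script staged via mktemp + sendfile
arguments = (' -input hdfs:///user/alice/wordcount/in'
             ' -output hdfs:///user/alice/wordcount/out'
             ' -mapper tmp.ab12cd'
             ' -numReduceTasks 0')            # no Reducer port connected
full_arguments = command + generics + arguments
# call_hadoop prepends config['hadoop'], joins everything, and substitutes the
# result into the Subshell's "%s" placeholder before job.run().
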
Example #8
class RunJob(RQModule):
    """ Run an asynchronous command that can be detached and polled.
        This is preferable to RunCommand for long-running operations.
    """

    _input_ports = [
        ('machine', Machine),
        ('command', '(edu.utah.sci.vistrails.basic:String)', True),
        ('working_directory', '(edu.utah.sci.vistrails.basic:String)'),
    ]

    _output_ports = [
        ('stdout', '(edu.utah.sci.vistrails.basic:String)'),
        ('stderr', '(edu.utah.sci.vistrails.basic:String)'),
    ]

    job = None

    def job_read_inputs(self):
        d = {}
        if not self.has_input('command'):
            raise ModuleError(self, "No command specified")
        d['command'] = self.get_input('command').strip()
        d['working_directory'] = self.get_input('working_directory') \
              if self.has_input('working_directory') else '.'
        return d

    def job_start(self, params):
        work_dir = params['working_directory']
        self.machine = self.get_machine()
        use_machine(self.machine)
        self.job = Subshell("remote", params['command'], work_dir)
        self.job.run()
        ret = self.job._ret
        if ret:
            try:
                job_id = int(ret.split('\n')[0])
            except ValueError:
                end_machine()
                raise ModuleError(self, "Error submitting job: %s" % ret)
        self.set_job_machine(params, self.machine)
        return params

    def job_get_handle(self, params):
        if not self.job:
            self.job_start(params)
        return self.job

    def job_finish(self, params):
        params['stdout'] = self.job.standard_output()
        params['stderr'] = self.job.standard_error()
        if self.job.failed():
            self.job._pushw()
            code = self.job.terminal.cat("%s.failed" %
                                         self.job._identifier_filename)
            self.job._popw()
            end_machine()
            raise ModuleError(
                self, "Command failed with exit code %s: %s" %
                (code.strip(), params['stderr'].strip()))
        end_machine()
        return params

    def job_set_results(self, params):
        self.set_output('stdout', params['stdout'])
        self.set_output('stderr', params['stderr'])
Example #9
    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])
Example #10
class HadoopStreaming(HadoopBaseModule):
    """
    The class for executing MapReduce using Hadoop Streaming with
    customized Python Mapper/Reducer/Combiner
    
    """
    _settings = ModuleSettings(namespace='hadoop')
    _input_ports = [IPort('Mapper',       File),
                    IPort('Reducer',      File),
                    IPort('Combiner',     File),
                    IPort('Workdir',      String),
                    IPort('Identifier',   String),
                    IPort('Input',        String),
                    IPort('Output',       String),
                    IPort('CacheFile',    String),
                    IPort('CacheArchive', String),
                    IPort('Environment',  String),
                    IPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)')]

    _output_ports = [OPort('Machine', '(org.vistrails.vistrails.remoteq:Machine)'),
                     OPort('Output', String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)
        self.job = None
        self.job_machine = None

    def readInputs(self):
        p = {}
        self.localMapper = self.force_get_input('Mapper')
        self.localReducer = self.force_get_input('Reducer')
        self.localCombiner = self.force_get_input('Combiner')
        p['workdir'] = self.force_get_input('Workdir')
        if p['workdir'] is None:
            p['workdir'] = ".vistrails-hadoop"
        p['job_identifier'] = self.force_get_input('Identifier')
        if p['job_identifier'] is None:
            raise ModuleError(self, 'Job Identifier is required')
        p['input'] = self.force_get_input('Input')
        p['output'] = self.force_get_input('Output')
        if p['input'] is None or p['output'] is None:
            raise ModuleError(self, 'Input and Output are required')
        p['files'] = self.force_get_input_list('CacheFile')
        p['cacheArchives'] = self.force_get_input_list('CacheArchive')
        p['envVars'] = self.force_get_input_list('Environment') 
        return p

    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s",
                            working_directory=p['workdir'],
                            identifier=p['job_identifier'])

    def startJob(self, p):
        self.createJob(p)
        if not self.job_machine.remote.isdir(p['workdir']):
            self.job_machine.remote.mkdir(p['workdir'])
        self.set_job_machine(p, self.job_machine)
        self.job.reset()

        # Now generate the command line
        config = self.get_hadoop_config(self.job_machine)
        command = 'jar %s' % config['streaming.jar']
        generics = ''
        arguments = ''

        if '://' not in p['input']:
            p['input'] = self.add_prefix(p['input'], self.job_machine)
        if '://' not in p['output']:
            p['output'] = self.add_prefix(p['output'], self.job_machine)
        arguments += ' -input %s -output %s' % (p['input'], p['output'])
        
        if self.localMapper is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localMapper.name, tempfile)
            mapperFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -mapper %s' % mapperFileName
        else:
            arguments += ' -mapper org.apache.hadoop.mapred.lib.IdentityMapper'

        if self.localCombiner is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localCombiner.name,
                                               tempfile)
            combinerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -combiner %s' % combinerFileName

        if self.localReducer is not None:
            tempfile = self.job_machine.remote.send_command('mktemp').strip()
            result = self.job_machine.sendfile(self.localReducer.name,
                                               tempfile)
            reducerFileName = os.path.split(tempfile)[1]
            p['files'].append(tempfile)
            arguments += ' -reducer %s' % reducerFileName
        else:
            arguments += ' -numReduceTasks 0'

        for var in p['envVars']:
            arguments += ' -cmdenv ' + var

        for cacheArchive in p['cacheArchives']:
            arguments += ' -cacheArchive %s' % cacheArchive

        #from init import configuration
        #if configuration.check('uris') and configuration.uris:
        #    for uri in configuration.uris.split(';'):
        #        p['files'].append(uri)
        # -files is a generic Hadoop option and must precede the
        # streaming-specific options
        if p['files']:
            generics += ' -files ' + ','.join(p['files'])

        arguments = command + generics + arguments
        result = self.call_hadoop(arguments, p['workdir'],
                                  p['job_identifier'], self.job_machine)
        return p

    def getMonitor(self, p):
        if not self.job:
            self.createJob(p)
        return self.job

    def finishJob(self, p):
        r = {}
        r['output'] = p['output']
        r['workdir'] = p['workdir']
        r['job_identifier'] = p['job_identifier']
        
        self.annotate({'hadoop_log':self.job.standard_error()})
        if self.job.failed():
            error = self.job.standard_error()
            raise ModuleError(self, error)
        return r

    def setResults(self, p):
        self.set_output('Output', p['output'])
        self.set_output('Machine', self.job_machine)

    def call_hadoop(self, arguments, workdir, identifier, machine):
        config = self.get_hadoop_config(machine)
        argList = [config['hadoop']]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(' ')
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, 'Invalid argument types to hadoop')
        self.annotate({'hadoop_command':" ".join(argList)})
        self.job.command = self.job.command % " ".join(argList)
        self.job.run()
Example #11
class HadoopStreaming(HadoopBaseModule):
    """
    The class for executing MapReduce using Hadoop Streaming with
    customized Python Mapper/Reducer/Combiner
    
    """

    _settings = ModuleSettings(namespace="hadoop")
    _input_ports = [
        IPort("Mapper", File),
        IPort("Reducer", File),
        IPort("Combiner", File),
        IPort("Workdir", String),
        IPort("Identifier", String),
        IPort("Input", String),
        IPort("Output", String),
        IPort("CacheFile", String),
        IPort("CacheArchive", String),
        IPort("Environment", String),
        IPort("Machine", "(org.vistrails.vistrails.remoteq:Machine)"),
    ]

    _output_ports = [OPort("Machine", "(org.vistrails.vistrails.remoteq:Machine)"), OPort("Output", String)]

    def __init__(self):
        HadoopBaseModule.__init__(self)
        self.job = None
        self.job_machine = None

    def job_read_inputs(self):
        p = {}
        self.localMapper = self.force_get_input("Mapper")
        self.localReducer = self.force_get_input("Reducer")
        self.localCombiner = self.force_get_input("Combiner")
        p["workdir"] = self.force_get_input("Workdir")
        if p["workdir"] == None:
            p["workdir"] = ".vistrails-hadoop"
        p["job_identifier"] = self.force_get_input("Identifier")
        if p["job_identifier"] is None:
            raise ModuleError(self, "Job Identifier is required")
        p["input"] = self.force_get_input("Input")
        p["output"] = self.force_get_input("Output")
        if p["input"] == None or p["output"] == None:
            raise ModuleError(self, "Input and Output are required")
        p["files"] = self.force_get_input_list("CacheFile")
        p["cacheArchives"] = self.force_get_input_list("CacheArchive")
        p["envVars"] = self.force_get_input_list("Environment")
        return p

    def createJob(self, p):
        self.job_machine = self.get_machine()
        use_machine(self.job_machine)
        self.job = Subshell("remote", command="%s", working_directory=p["workdir"], identifier=p["job_identifier"])

    def job_start(self, p):
        self.createJob(p)
        if not self.job_machine.remote.isdir(p["workdir"]):
            self.job_machine.remote.mkdir(p["workdir"])
        self.set_job_machine(p, self.job_machine)
        self.job.reset()

        # Now generate the command line
        config = self.get_hadoop_config(self.job_machine)
        command = "jar %s" % config["streaming.jar"]
        generics = ""
        arguments = ""

        if "://" not in p["input"]:
            p["input"] = self.add_prefix(p["input"], self.job_machine)
        if "://" not in p["output"]:
            p["output"] = self.add_prefix(p["output"], self.job_machine)
        arguments += " -input %s -output %s" % (p["input"], p["output"])

        if self.localMapper is not None:
            tempfile = self.job_machine.remote.send_command("mktemp").strip()
            result = self.job_machine.sendfile(self.localMapper.name, tempfile)
            mapperFileName = os.path.split(tempfile)[1]
            p["files"].append(tempfile)
            arguments += " -mapper %s" % mapperFileName
        else:
            arguments += " -mapper org.apache.hadoop.mapred.lib.IdentityMapper"

        if self.localCombiner is not None:
            tempfile = self.job_machine.remote.send_command("mktemp").strip()
            result = self.job_machine.sendfile(self.localCombiner.name, tempfile)
            combinerFileName = os.path.split(tempfile)[1]
            p["files"].append(tempfile)
            arguments += " -combiner %s" % combinerFileName

        if self.localReducer is not None:
            tempfile = self.job_machine.remote.send_command("mktemp").strip()
            result = self.job_machine.sendfile(self.localReducer.name, tempfile)
            reducerFileName = os.path.split(tempfile)[1]
            p["files"].append(tempfile)
            arguments += " -reducer %s" % reducerFileName
        else:
            arguments += " -numReduceTasks 0"

        for var in p["envVars"]:
            arguments += " -cmdenv " + var

        for cacheArchive in p["cacheArchives"]:
            arguments += " -cacheArchive %s" % cacheArchive

        # from init import configuration
        # if configuration.check('uris') and configuration.uris:
        #    for uri in configuration.uris.split(';'):
        #        p['files'].append(uri)
        # -files is a generic Hadoop option and must precede the
        # streaming-specific options
        if p["files"]:
            generics += " -files " + ",".join(p["files"])

        arguments = command + generics + arguments
        result = self.call_hadoop(arguments, p["workdir"], p["job_identifier"], self.job_machine)
        return p

    def job_get_handle(self, p):
        if not self.job:
            self.createJob(p)
        return self.job

    def job_finish(self, p):
        r = {}
        r["output"] = p["output"]
        r["workdir"] = p["workdir"]
        r["job_identifier"] = p["job_identifier"]

        self.annotate({"hadoop_log": self.job.standard_error()})
        if self.job.failed():
            error = self.job.standard_error()
            raise ModuleError(self, error)
        return r

    def job_set_results(self, p):
        self.set_output("Output", p["output"])
        self.set_output("Machine", self.job_machine)

    def call_hadoop(self, arguments, workdir, identifier, machine):
        config = self.get_hadoop_config(machine)
        argList = [config["hadoop"]]
        if type(arguments) in [str, unicode]:
            argList += arguments.split(" ")
        elif type(arguments) == list:
            argList += arguments
        else:
            raise ModuleError(self, "Invalid argument types to hadoop")
        self.annotate({"hadoop_command": " ".join(argList)})
        self.job.command = self.job.command % " ".join(argList)
        self.job.run()