Ejemplo n.º 1
0
def createHadoopWordcountDefinition():
    """Build the shell process definition for the Hadoop ``wordcount`` job.

    The executable is a ``HadoopModule.JarExecutable`` marked as not
    stageable.  Its path is the Hadoop executable, its jar file is the one
    returned by ``getExamplesJar()`` and its jar class is ``wordcount``.

    Two command-line parameters are declared: ``input file`` (flagged as an
    input file) and ``output file`` (flagged as a side effect).  One ordering
    row is registered with ``input file`` as source and ``output file`` as
    target.

    Returns:
        The object produced by
        ``DefinitionModule.createShellProcessDefinition``.
    """
    orderingTable = DefinitionModule.createParameterOrderingTable()
    orderingRow = orderingTable.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO:
    # need to be able to customize this for each host
    jarExecutable = HadoopModule.JarExecutable()
    jarExecutable.stageable(False)
    jarExecutable.path([HadoopModule.getExecutablePath()])
    jarExecutable.jarFile([getExamplesJar()])
    jarExecutable.jarClass(['wordcount'])

    # Port attribute tables, keyed by parameter name.
    inputFileAttributes = {
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
        ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
    }
    outputFileAttributes = {
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
        ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
    }
    portDescriptions = {
        'input file': inputFileAttributes,
        'output file': outputFileAttributes,
    }

    return DefinitionModule.createShellProcessDefinition(
        inputParameters=portDescriptions,
        parameterOrderings=orderingTable,
        executable=jarExecutable,
    )
Ejemplo n.º 2
0
def createHadoopStreamingDefinition():
    """Build the shell process definition for a Hadoop streaming job.

    The executable is a ``HadoopModule.JarExecutable`` marked as not
    stageable.  Its path is the Hadoop executable, its jar file is the one
    returned by ``getStreamingJar()`` and its jar class list is empty.

    Four command-line parameters are declared, each with a prefix flag:
    ``input file`` (``-input``), ``output file`` (``-output``), ``mapper``
    (``-mapper``) and ``reducer`` (``-reducer``).  ``output file`` is flagged
    as a side effect; the other three are flagged as input files.  Three
    ordering rows chain the parameters as
    input file -> output file -> mapper -> reducer.

    Returns:
        The object produced by
        ``DefinitionModule.createShellProcessDefinition``.
    """
    orderingTable = DefinitionModule.createParameterOrderingTable()
    orderedPairs = (
        ('input file', 'output file'),
        ('output file', 'mapper'),
        ('mapper', 'reducer'),
    )
    for sourceName, targetName in orderedPairs:
        orderingRow = orderingTable.addRow()
        orderingRow.setColumn('source', sourceName)
        orderingRow.setColumn('target', targetName)

    # TODO:
    # need to be able to customize this for each host
    jarExecutable = HadoopModule.JarExecutable()
    jarExecutable.stageable(False)
    jarExecutable.path([HadoopModule.getExecutablePath()])
    jarExecutable.jarFile([getStreamingJar()])
    jarExecutable.jarClass([])

    def describePort(roleAttribute, prefixFlag):
        # Attribute table for one command-line port: the role attribute
        # (input file / side effect) plus the flag that precedes its value.
        return {
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
            roleAttribute: True,
            ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
                ParameterModule.COMMANDLINE_PREFIX_FLAG: [prefixFlag],
            },
        }

    portDescriptions = {
        'input file': describePort(
            ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE, '-input'),
        'output file': describePort(
            ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT, '-output'),
        'mapper': describePort(
            ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE, '-mapper'),
        'reducer': describePort(
            ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE, '-reducer'),
    }

    return DefinitionModule.createShellProcessDefinition(
        inputParameters=portDescriptions,
        parameterOrderings=orderingTable,
        executable=jarExecutable,
    )
Ejemplo n.º 3
0
    def setUp(self):
        """Prepare the fixture for a Hadoop test.

        Asserts that the path returned by
        ``HadoopModule.getExecutablePath()`` exists, runs the base class
        ``setUp`` and then calls ``removeOutputFiles``.
        """
        hadoopExecutable = HadoopModule.getExecutablePath()
        self.assertTrue(os.path.exists(hadoopExecutable))

        BaseModule.BaseTestClass.setUp(self)

        # TODO:
        # will need to place test data into HDFS

        self.removeOutputFiles()
Ejemplo n.º 4
0
    def testCreatePipesExecutable(self):
        """Check the object built by ``createPipesExecutableObject``.

        The builder is expected to return a ``HadoopModule.PipesExecutable``
        that is not stageable, whose path is the Hadoop executable and whose
        pipes file is the path handed to the builder.
        """
        path = os.path.sep.join(['', 'path', 'to', 'pipes'])
        executableObject = self.builder.createPipesExecutableObject(path)

        self.assertTrue(isinstance(executableObject, HadoopModule.PipesExecutable))
        self.assertFalse(executableObject.stageable())
        # assertEqual rather than assertEquals: the latter is a deprecated
        # alias that was removed from unittest in Python 3.12.
        self.assertEqual([HadoopModule.getExecutablePath()],
                         executableObject.path())
        self.assertEqual([path],
                         executableObject.pipesFile())

        return
Ejemplo n.º 5
0
    def testCreateStreamingExecutable(self):
        """Check the object built by ``createStreamingExecutableObject``.

        The builder is expected to return a ``HadoopModule.JarExecutable``
        that is not stageable, whose path is the Hadoop executable, whose jar
        file is ``HadoopModule.getStreamingJar()`` and whose jar class list
        is empty.
        """
        executableObject = self.builder.createStreamingExecutableObject()

        self.assertTrue(isinstance(executableObject, HadoopModule.JarExecutable))
        self.assertFalse(executableObject.stageable())
        # assertEqual rather than assertEquals: the latter is a deprecated
        # alias that was removed from unittest in Python 3.12.
        self.assertEqual([HadoopModule.getExecutablePath()],
                         executableObject.path())
        self.assertEqual([HadoopModule.getStreamingJar()],
                         executableObject.jarFile())
        self.assertEqual([],
                         executableObject.jarClass())

        return
Ejemplo n.º 6
0
    def testCreateDefaultExecutable(self):
        """Check the object built by ``createExecutableObject``.

        The builder is expected to return a ``HadoopModule.JarExecutable``
        that is not stageable, whose path is the Hadoop executable and whose
        jar file and jar class are the values handed to the builder.
        """
        jarFile = 'myJarFile'
        jarClass = 'myJarClass'
        executableObject = self.builder.createExecutableObject(jarFile, jarClass)

        self.assertTrue(isinstance(executableObject, HadoopModule.JarExecutable))
        self.assertFalse(executableObject.stageable())
        # assertEqual rather than assertEquals: the latter is a deprecated
        # alias that was removed from unittest in Python 3.12.
        self.assertEqual([HadoopModule.getExecutablePath()],
                         executableObject.path())
        self.assertEqual([jarFile],
                         executableObject.jarFile())
        self.assertEqual([jarClass],
                         executableObject.jarClass())

        return
Ejemplo n.º 7
0
def createHadoopPipesDefinition():
    """Build the shell process definition for a Hadoop pipes job.

    The executable is a ``HadoopModule.PipesExecutable`` marked as not
    stageable.  Its path is the Hadoop executable and its pipes file is
    ``['pipesProgram']``.

    Two command-line parameters are declared, each with a prefix flag:
    ``input file`` (``-input``, flagged as an input file) and ``output file``
    (``-output``, flagged as a side effect).  One ordering row is registered
    with ``input file`` as source and ``output file`` as target.

    Returns:
        The object produced by
        ``DefinitionModule.createShellProcessDefinition``.
    """
    orderingTable = DefinitionModule.createParameterOrderingTable()
    orderingRow = orderingTable.addRow()
    orderingRow.setColumn('source', 'input file')
    orderingRow.setColumn('target', 'output file')

    # TODO:
    # need to be able to customize this for each host
    pipesCommand = ['pipesProgram']
    pipesExecutable = HadoopModule.PipesExecutable()
    pipesExecutable.stageable(False)
    pipesExecutable.path([HadoopModule.getExecutablePath()])
    pipesExecutable.pipesFile(pipesCommand)

    # Port attribute tables, keyed by parameter name.
    inputFileAttributes = {
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
        ParameterModule.PORT_ATTRIBUTE_ISINPUTFILE: True,
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
            ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-input'],
        },
    }
    outputFileAttributes = {
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE: True,
        ParameterModule.PORT_ATTRIBUTE_ISSIDEEFFECT: True,
        ParameterModule.PORT_ATTRIBUTE_COMMANDLINE_OPTIONS: {
            ParameterModule.COMMANDLINE_PREFIX_FLAG: ['-output'],
        },
    }
    portDescriptions = {
        'input file': inputFileAttributes,
        'output file': outputFileAttributes,
    }

    return DefinitionModule.createShellProcessDefinition(
        inputParameters=portDescriptions,
        parameterOrderings=orderingTable,
        executable=pipesExecutable,
    )
Ejemplo n.º 8
0
 def fileExists(self, file):
     """Report whether ``fs -stat`` succeeds for *file*.

     Runs ``<hadoop executable> fs -stat <file>`` through ``os.system``.

     Args:
         file: path interpolated verbatim into the shell command line.

     Returns:
         True when ``os.system`` returns 0, False otherwise.
     """
     status = os.system('%s fs -stat %s' % (HadoopModule.getExecutablePath(), file))
     # Compare by value: ``is 0`` tests object identity, which only works
     # by accident of small-int caching and warns on Python 3.8+.
     return status == 0
Ejemplo n.º 9
0
 def removeOutputFiles(self):
     """Remove ``TestHadoopStreaming1.DIR_OUTPUT`` when it already exists.

     Existence is checked with ``fileExists``; removal runs
     ``<hadoop executable> fs -rmr <dir>`` through ``os.system``.
     """
     outputDir = TestHadoopStreaming1.DIR_OUTPUT
     # ensure that the outdir does not exist
     if not self.fileExists(outputDir):
         return
     os.system('%s fs -rmr %s' % (HadoopModule.getExecutablePath(), outputDir))