Ejemplo n.º 1
0
def setup_paths(module_paths):
    """Set up sys.path on the mappers and reducers.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the location
    of the PyCascading Python sources, and module_paths[1] is the location of
    the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_paths -- the locations of the Python sources; module_paths[0] is
        read unconditionally, so at least one entry is required
    """
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]
    jython_lib = jython_dir + '/python/Lib'

    sys.path.extend((cascading_jar, jython_dir + '/python', jython_lib))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages
    # Thanks to Simon Radford
    # The site directory must live under the same Lib folder that was appended
    # to sys.path above, hence the '/' separator after jython_dir; without it
    # the path became '<jython_dir>python/Lib/site-packages'.
    import site
    site.addsitedir(jython_lib + '/site-packages')
Ejemplo n.º 2
0
 def run(self, num_reducers=100, config=None):
     """Kick off execution of the Cascading flow.

     Call this once the pipeline has been fully assembled and the flow
     should actually be started.

     Arguments:
     num_reducers -- passed through unchanged to Util.run
     config -- passed through unchanged to Util.run
     """
     # Gather every source that appears in the context of some tail
     referenced = set()
     for tail in self.tails:
         referenced.update(tail.context)
     # Keep only the source map entries that a tail actually refers to
     live_sources = dict((name, self.source_map[name])
                         for name in self.source_map.iterkeys()
                         if name in referenced)
     assemblies = [tail.get_assembly() for tail in self.tails]
     Util.run(num_reducers, config, live_sources, self.sink_map, assemblies)
Ejemplo n.º 3
0
    def run(self, num_reducers=50, config=None):
        """Launch the Cascading job built from this flow.

        Intended to be called explicitly after the pipeline has been put
        together, to start the flow process.

        Arguments:
        num_reducers -- handed to Util.run as its first argument
        config -- handed to Util.run as its second argument
        """
        # Union of the contexts of all the tails: the sources really in use
        in_use = set()
        for pipe_tail in self.tails:
            in_use.update(pipe_tail.context)

        # Copy over only those source map entries whose key is in use
        pruned_sources = dict(
            (key, self.source_map[key])
            for key in self.source_map.iterkeys()
            if key in in_use)

        tail_assemblies = [pipe_tail.get_assembly()
                           for pipe_tail in self.tails]
        Util.run(num_reducers, config, pruned_sources, self.sink_map,
                 tail_assemblies)
Ejemplo n.º 4
0
def setup_paths(module_paths):
    """Set up sys.path on the mappers and reducers.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the location
    of the PyCascading Python sources, and module_paths[1] is the location of
    the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_paths -- the locations of the Python sources; module_paths[0] is
        read unconditionally, so at least one entry is required
    """
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]

    sys.path.extend((cascading_jar, jython_dir + '/python',
                     jython_dir + '/python/Lib'))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages
    # Thanks to Simon Radford
    # Note the leading '/': the site directory has to be located under the
    # '/python/Lib' folder added to sys.path above, not under a sibling path
    # named '<jython_dir>python'.
    import site
    site.addsitedir(jython_dir + '/python/Lib/site-packages')
Ejemplo n.º 5
0
def load_source(module_name, file_name):
    """Load a module from a Python source file and return it.

    Before loading, sys.path is extended with the Cascading jar and with the
    'python' and 'python/Lib' folders under the folder reported by
    Util.getJarFolder().

    Arguments:
    module_name -- the name under which the module is loaded
    file_name -- the file that contains the source of the module
    """
    from com.twitter.pycascading import Util

    jar_path = Util.getCascadingJar()
    jar_folder = Util.getJarFolder()
    python_root = jar_folder + '/python'
    for entry in (jar_path, python_root, python_root + '/Lib'):
        sys.path.append(entry)

    # An explicit 'import encodings' used to sit here because simplejson
    # would not work without it (possibly Jython tries to import it at
    # startup, before sys.path contains Lib, and fails). It is disabled now:
    #import encodings

    return imp.load_source(module_name, file_name)
Ejemplo n.º 6
0
def load_source(module_name, file_name):
    """Load a module from a Python source file and return it.

    Before loading, sys.path is extended with the folder reported by
    Util.getJarFolder() and with the 'python' and 'python/Lib' folders under
    the path obtained by applying _remove_last_dir to that folder twice.

    Arguments:
    module_name -- the name under which the module is loaded
    file_name -- the file that contains the source of the module
    """
    from com.twitter.pycascading import Util

    jar_folder = Util.getJarFolder()
    base_dir = _remove_last_dir(jar_folder)
    base_dir = _remove_last_dir(base_dir)
    python_root = base_dir + '/python'
    for entry in (jar_folder, python_root, python_root + '/Lib'):
        sys.path.append(entry)

    # simplejson does not work unless encodings is imported at this point.
    # Possibly Jython tries to import it automatically at startup, when
    # sys.path does not yet contain Lib, and that early attempt fails.
    import encodings

    return imp.load_source(module_name, file_name)
Ejemplo n.º 7
0
def load_source(module_name, file_name, module_paths):
    """Loads the given module from a Python source file.

    This function is called by PythonFunctionWrapper.prepare(...) after it
    started the Python interpreter to request the given source file to be
    loaded. The function is to be found in this source file.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the location
    of the PyCascading Python sources, and module_paths[1] is the location of
    the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_name -- the name under which the module is loaded
    file_name -- the file that contains the source for the module
    module_paths -- the locations of the Python sources; module_paths[0] is
        read unconditionally, so at least one entry is required

    Returns the module object produced by imp.load_source.
    """
    # This one should be on the classpath from the job jar or the extracted jar
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]

    sys.path.extend((cascading_jar, jython_dir + '/python',
                     jython_dir + '/python/Lib'))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages
    # The '/' after jython_dir is required so that the site directory is the
    # one under the '/python/Lib' folder appended to sys.path above; without
    # it the path became '<jython_dir>python/Lib/site-packages'.
    import site
    site.addsitedir(jython_dir + '/python/Lib/site-packages')

    # An explicit 'import encodings' used to sit here because simplejson
    # would not work without it (possibly Jython tries to import it at
    # startup, before sys.path contains Lib, and fails). It is disabled now:
    #import encodings

    return imp.load_source(module_name, file_name)
Ejemplo n.º 8
0
"""

__author__ = 'Gabor Szabo'


import sys, imp


if __name__ == "__main__":
    # Script entry point: extends sys.path and then loads the source file
    # named by the second command line parameter as the module 'main'.
    # The first command line parameter must be 'hadoop' or 'local'
    # to indicate the running mode
    running_mode = sys.argv[1]

    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    # This is the folder where Hadoop extracted the jar file for execution
    tmp_dir = Util.getJarFolder()

    # The initial value of sys.path is JYTHONPATH plus whatever Jython appends
    # to it (normally the Python standard libraries that come with Jython)
    # NOTE(review): 'python' is appended to tmp_dir without a separator, so
    # this presumably relies on Util.getJarFolder() returning a path that
    # ends with '/' -- confirm.
    sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + 'python',
                     tmp_dir + 'python/Lib'))

    # An explicit 'import encodings' was needed here, otherwise simplejson
    # would not work. Maybe it is automatically imported at the beginning of
    # a Jython program, but since at that point sys.path is not yet set to
    # Lib, it fails? It is left disabled below because we can use Java's JSON
    # decoder instead.
#    import encodings

    m = imp.load_source('main', sys.argv[2])
Ejemplo n.º 9
0
                     tmp_dir + '/python/Lib'))
    
    # Haha... it's necessary to put this here, otherwise simplejson won't work.
    # Maybe it's automatically imported in the beginning of a Jython program,
    # but since at that point the sys.path is not set yet to Lib, it will fail?
    import encodings

    return imp.load_source(module_name, file_name)


if __name__ == "__main__":
    # Script entry point: extends sys.path and then loads the source file
    # named by the second command line parameter as the module 'main'.
    # The first command line parameter is taken as the running mode
    running_mode = sys.argv[1]
    
    from com.twitter.pycascading import Util

    cascading_jar = Util.getJarFolder()
    # This is the folder where Hadoop extracted the jar file for execution
    # (the jar folder with _remove_last_dir applied to it twice)
    tmp_dir = _remove_last_dir(_remove_last_dir(cascading_jar))
    sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + '/python',
                     tmp_dir + '/python/Lib'))
    
    # It is necessary to import encodings here, otherwise simplejson won't
    # work. Maybe it is automatically imported at the beginning of a Jython
    # program, but since at that point sys.path is not yet set to Lib, it
    # fails?
    # Alternatively, we could use Java's JSON decoder...
    import encodings

    m = imp.load_source('main', sys.argv[2])
    # We need to explicitly inject running_mode into the tap modules,
    # otherwise we cannot import bootstrap from tap and use
    # bootstrap.running_mode that way
Ejemplo n.º 10
0
        (cascading_jar, tmp_dir + '/python', tmp_dir + '/python/Lib'))

    # Haha... it's necessary to put this here, otherwise simplejson won't work.
    # Maybe it's automatically imported in the beginning of a Jython program,
    # but since at that point the sys.path is not set yet to Lib, it will fail?
    import encodings

    return imp.load_source(module_name, file_name)


if __name__ == "__main__":
    # Script entry point: extends sys.path and then loads the source file
    # named by the second command line parameter as the module 'main'.
    # The first command line parameter is taken as the running mode
    running_mode = sys.argv[1]

    from com.twitter.pycascading import Util

    cascading_jar = Util.getJarFolder()
    # This is the folder where Hadoop extracted the jar file for execution
    # (the jar folder with _remove_last_dir applied to it twice)
    tmp_dir = _remove_last_dir(_remove_last_dir(cascading_jar))
    sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + '/python',
                     tmp_dir + '/python/Lib'))

    # It is necessary to import encodings here, otherwise simplejson won't
    # work. Maybe it is automatically imported at the beginning of a Jython
    # program, but since at that point sys.path is not yet set to Lib, it
    # fails?
    # Alternatively, we could use Java's JSON decoder...
    import encodings

    m = imp.load_source('main', sys.argv[2])
    # We need to explicitly inject running_mode into the tap modules,
    # otherwise we cannot import bootstrap from tap and use
    # bootstrap.running_mode that way
Ejemplo n.º 11
0
    # The first command line parameter must be 'hadoop' or 'local'
    # to indicate the running mode
    running_mode = sys.argv[1]

    # The second is the location of the PyCascading Python sources in local
    # mode, and the PyCascading tarball in Hadoop mode
    python_dir = sys.argv[2]

    # Remove the first two arguments so that sys.argv will look as if it
    # was coming from a simple command line execution
    # The further parameters are the command line parameters to the script
    sys.argv = sys.argv[3:]

    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    # This is the folder where Hadoop extracted the jar file for execution
    tmp_dir = Util.getJarFolder()

    Util.setPycascadingRoot(python_dir)

    # The initial value of sys.path is JYTHONPATH plus whatever Jython appends
    # to it (normally the Python standard libraries that come with Jython)
    sys.path.extend((cascading_jar, '.', tmp_dir, python_dir + '/python',
                     python_dir + '/python/Lib'))

    # Allow the importing of user-installed Jython packages
    # The '/' after python_dir is required so that the site directory is the
    # one under the '/python/Lib' folder appended to sys.path above; without
    # it the path became '<python_dir>python/Lib/site-packages'.
    import site
    site.addsitedir(python_dir + '/python/Lib/site-packages')

    import os
Ejemplo n.º 12
0
    # The first command line parameter must be 'hadoop' or 'local'
    # to indicate the running mode
    running_mode = sys.argv[1]

    # The second is the location of the PyCascading Python sources in local
    # mode, and the PyCascading tarball in Hadoop mode
    python_dir = sys.argv[2]

    # Remove the first two arguments so that sys.argv will look as if it
    # was coming from a simple command line execution
    # The further parameters are the command line parameters to the script
    sys.argv = sys.argv[3:]

    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    # This is the folder where Hadoop extracted the jar file for execution
    tmp_dir = Util.getJarFolder()

    Util.setPycascadingRoot(python_dir)

    # The initial value of sys.path is JYTHONPATH plus whatever Jython appends
    # to it (normally the Python standard libraries that come with Jython)
    python_root = python_dir + '/python'
    sys.path.extend((cascading_jar, '.', tmp_dir, python_root,
                     python_root + '/Lib'))

    # Allow the importing of user-installed Jython packages
    # Built from the same root as the Lib entry above so that the separator
    # after python_dir cannot be dropped; previously the path was
    # '<python_dir>python/Lib/site-packages'.
    import site
    site.addsitedir(python_root + '/Lib/site-packages')

    import os