def setup_paths(module_paths):
    """Set up sys.path on the mappers and reducers.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the
    location of the PyCascading Python sources, and module_paths[1] is the
    location of the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_paths -- the locations of the Python sources; must contain at
                    least one element (module_paths[0] is always read)
    """
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]
    python_root = jython_dir + '/python'
    lib_dir = python_root + '/Lib'

    sys.path.extend((cascading_jar, python_root, lib_dir))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages
    # Thanks to Simon Radford
    # NOTE: site is imported only after sys.path has been extended, as in
    # the original code.
    import site
    # The directory must be built with the same '/python/Lib' prefix as the
    # sys.path entries above; without the leading separator the path would
    # be glued directly onto the last component of jython_dir.
    site.addsitedir(lib_dir + '/site-packages')
def run(self, num_reducers=100, config=None):
    """Start the Cascading job.

    Call this once the pipeline has been fully built, to explicitly start
    the flow process.

    Arguments:
    num_reducers -- forwarded unchanged to Util.run
    config -- forwarded unchanged to Util.run
    """
    # Gather every source that appears in the context of at least one tail
    referenced = set()
    for tail in self.tails:
        referenced.update(tail.context)

    # Drop the sources that no tail refers to from the source map
    used_sources = dict(
        (src, self.source_map[src])
        for src in self.source_map.iterkeys()
        if src in referenced)

    assemblies = [tail.get_assembly() for tail in self.tails]
    Util.run(num_reducers, config, used_sources, self.sink_map, assemblies)
def run(self, num_reducers=50, config=None):
    """Start the Cascading job.

    Call this once the pipeline has been fully built, to explicitly start
    the flow process.

    Arguments:
    num_reducers -- forwarded unchanged to Util.run
    config -- forwarded unchanged to Util.run
    """
    # Gather every source that appears in the context of at least one tail
    referenced = set()
    for tail in self.tails:
        referenced.update(tail.context)

    # Drop the sources that no tail refers to from the source map
    used_sources = dict(
        (src, self.source_map[src])
        for src in self.source_map.iterkeys()
        if src in referenced)

    assemblies = [tail.get_assembly() for tail in self.tails]
    Util.run(num_reducers, config, used_sources, self.sink_map, assemblies)
def setup_paths(module_paths):
    """Set up sys.path on the mappers and reducers.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the
    location of the PyCascading Python sources, and module_paths[1] is the
    location of the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_paths -- the locations of the Python sources; must contain at
                    least one element (module_paths[0] is always read)
    """
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]
    python_root = jython_dir + '/python'
    lib_dir = python_root + '/Lib'

    sys.path.extend((cascading_jar, python_root, lib_dir))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages
    # Thanks to Simon Radford
    # NOTE: site is imported only after sys.path has been extended, as in
    # the original code.
    import site
    # The directory must be built with the same '/python/Lib' prefix as the
    # sys.path entries above; without the leading separator the path would
    # be glued directly onto the last component of jython_dir.
    site.addsitedir(lib_dir + '/site-packages')
def load_source(module_name, file_name):
    """Load a module from a Python source file and return it.

    Before loading, sys.path is extended with the Cascading jar and the
    'python' and 'python/Lib' folders under the jar folder reported by Util.

    Arguments:
    module_name -- the name to load the module under
    file_name -- the file that contains the source for the module
    """
    from com.twitter.pycascading import Util

    jar_path = Util.getCascadingJar()
    jar_folder = Util.getJarFolder()
    python_root = jar_folder + '/python'
    for entry in (jar_path, python_root, python_root + '/Lib'):
        sys.path.append(entry)

    # Historical note: importing 'encodings' at this point used to be needed
    # for simplejson to work (possibly because Jython tries to import it at
    # startup, before Lib is on sys.path). The workaround is disabled:
    #import encodings
    return imp.load_source(module_name, file_name)
def load_source(module_name, file_name):
    """Load a module from a Python source file and return it.

    Before loading, sys.path is extended with the jar folder reported by
    Util and with the 'python' and 'python/Lib' folders found two directory
    levels above it.

    Arguments:
    module_name -- the name to load the module under
    file_name -- the file that contains the source for the module
    """
    from com.twitter.pycascading import Util

    jar_folder = Util.getJarFolder()
    # Strip the last two path components of the jar folder to get the base
    base_dir = _remove_last_dir(_remove_last_dir(jar_folder))
    python_root = base_dir + '/python'
    for entry in (jar_folder, python_root, python_root + '/Lib'):
        sys.path.append(entry)

    # 'encodings' has to be imported here, otherwise simplejson won't work.
    # Possibly Jython imports it automatically at program start, but at that
    # point sys.path does not yet contain Lib, so that import would fail.
    import encodings
    return imp.load_source(module_name, file_name)
def load_source(module_name, file_name, module_paths):
    """Loads the given module from a Python source file.

    This function is called by PythonFunctionWrapper.prepare(...) after it
    started the Python interpreter to request the given source file to be
    loaded. The function is to be found in this source file.

    module_paths is an array of path names where the sources or other
    supporting files are found. In particular, module_paths[0] is the
    location of the PyCascading Python sources, and module_paths[1] is the
    location of the source file defining the function.

    In Hadoop mode (with remote_deploy.sh), the first two -a options must
    specify the archives of the PyCascading sources and the job sources,
    respectively.

    Arguments:
    module_name -- the name to load the module under
    file_name -- the file that contains the source for the module
    module_paths -- the locations of the Python sources; must contain at
                    least one element (module_paths[0] is always read)

    Returns the module object produced by imp.load_source.
    """
    # This one should be on the classpath from the job jar or the extracted
    # jar
    from com.twitter.pycascading import Util

    cascading_jar = Util.getCascadingJar()
    jython_dir = module_paths[0]
    python_root = jython_dir + '/python'
    lib_dir = python_root + '/Lib'

    sys.path.extend((cascading_jar, python_root, lib_dir))
    sys.path.extend(module_paths[1:])

    # Allow importing of user-installed Jython packages.
    # NOTE: site is imported only after sys.path has been extended, as in
    # the original code.
    import site
    # The directory must be built with the same '/python/Lib' prefix as the
    # sys.path entries above; without the leading separator the path would
    # be glued directly onto the last component of jython_dir.
    site.addsitedir(lib_dir + '/site-packages')

    # Historical note: importing 'encodings' at this point used to be needed
    # for simplejson to work (possibly because Jython tries to import it at
    # startup, before Lib is on sys.path). The workaround is disabled:
    #import encodings
    return imp.load_source(module_name, file_name)
""" __author__ = 'Gabor Szabo' import sys, imp if __name__ == "__main__": # The first command line parameter must be 'hadoop' or 'local' # to indicate the running mode running_mode = sys.argv[1] from com.twitter.pycascading import Util cascading_jar = Util.getCascadingJar() # This is the folder where Hadoop extracted the jar file for execution tmp_dir = Util.getJarFolder() # The initial value of sys.path is JYTHONPATH plus whatever Jython appends # to it (normally the Python standard libraries the come with Jython) sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + 'python', tmp_dir + 'python/Lib')) # Haha... it's necessary to put this here, otherwise simplejson won't work. # Maybe it's automatically imported in the beginning of a Jython program, # but since at that point the sys.path is not set yet to Lib, it will fail? # Instead, we can use Java's JSON decoder... # import encodings m = imp.load_source('main', sys.argv[2])
tmp_dir + '/python/Lib')) # Haha... it's necessary to put this here, otherwise simplejson won't work. # Maybe it's automatically imported in the beginning of a Jython program, # but since at that point the sys.path is not set yet to Lib, it will fail? import encodings return imp.load_source(module_name, file_name) if __name__ == "__main__": running_mode = sys.argv[1] from com.twitter.pycascading import Util cascading_jar = Util.getJarFolder() # This is the folder where Hadoop extracted the jar file for execution tmp_dir = _remove_last_dir(_remove_last_dir(cascading_jar)) sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + '/python', tmp_dir + '/python/Lib')) # Haha... it's necessary to put this here, otherwise simplejson won't work. # Maybe it's automatically imported in the beginning of a Jython program, # but since at that point the sys.path is not set yet to Lib, it will fail? # Instead, we can use Java's JSON decoder... import encodings m = imp.load_source('main', sys.argv[2]) # We need to explicitly inject running_mode into the tap modules, # otherwise we cannot import bootstrap from tap and use the # bootstrap.running_mode like that
(cascading_jar, tmp_dir + '/python', tmp_dir + '/python/Lib')) # Haha... it's necessary to put this here, otherwise simplejson won't work. # Maybe it's automatically imported in the beginning of a Jython program, # but since at that point the sys.path is not set yet to Lib, it will fail? import encodings return imp.load_source(module_name, file_name) if __name__ == "__main__": running_mode = sys.argv[1] from com.twitter.pycascading import Util cascading_jar = Util.getJarFolder() # This is the folder where Hadoop extracted the jar file for execution tmp_dir = _remove_last_dir(_remove_last_dir(cascading_jar)) sys.path.extend((cascading_jar, '.', tmp_dir, tmp_dir + '/python', tmp_dir + '/python/Lib')) # Haha... it's necessary to put this here, otherwise simplejson won't work. # Maybe it's automatically imported in the beginning of a Jython program, # but since at that point the sys.path is not set yet to Lib, it will fail? # Instead, we can use Java's JSON decoder... import encodings m = imp.load_source('main', sys.argv[2]) # We need to explicitly inject running_mode into the tap modules, # otherwise we cannot import bootstrap from tap and use the # bootstrap.running_mode like that
# The first command line parameter must be 'hadoop' or 'local'
# to indicate the running mode
running_mode = sys.argv[1]

# The second is the location of the PyCascading Python sources in local
# mode, and the PyCascading tarball in Hadoop mode
python_dir = sys.argv[2]

# Drop sys.argv[0] and the two bootstrap arguments read above, so that
# sys.argv looks as if it was coming from a simple command line execution.
# The remaining parameters are the command line parameters to the script
sys.argv = sys.argv[3:]

from com.twitter.pycascading import Util

cascading_jar = Util.getCascadingJar()
# This is the folder where Hadoop extracted the jar file for execution
tmp_dir = Util.getJarFolder()

Util.setPycascadingRoot(python_dir)

# The initial value of sys.path is JYTHONPATH plus whatever Jython appends
# to it (normally the Python standard libraries that come with Jython)
sys.path.extend((cascading_jar, '.', tmp_dir, python_dir + '/python',
                 python_dir + '/python/Lib'))

# Allow the importing of user-installed Jython packages.
# The path needs the same '/python/Lib' prefix as the sys.path entries
# above; without the leading separator it would be glued directly onto
# the last component of python_dir.
import site
site.addsitedir(python_dir + '/python/Lib/site-packages')

import os