from azkaban import Job, Project


def azkaban_mongo_job():
    # Pushes Mongo data statistics, then reports success once the upload finishes.
    # The 'MongoStart' job referenced below is assumed to be defined elsewhere.
    project_2 = Project('LeadDataStats')
    project_2.add_job(
        'UploadDataStats',
        Job(
            {'type': 'command',
             'command': 'python /home/msingh/Documents/'
                        'PycharmProjects/AzkabanTest/mongo/MongoDataPush.py'},
            {'dependencies': 'MongoStart'},
        ),
    )
    project_3 = Project('MongoDataUpload')
    project_3.add_job(
        'UploadDataStatus',
        Job(
            {'type': 'command', 'command': 'echo "Data successfully uploaded"'},
            {'dependencies': 'UploadDataStats'},
        ),
    )
import logging
import os

from azkaban import Job, Project

logger = logging.getLogger(__name__)


def build_project(project_name, global_props, project_props, jobs, files, version):
    """Assemble an Azkaban project from property dicts, job definitions, and extra files."""
    logger.info("Building workflow %s, version: %s.", project_name, version)
    project = Project(project_name, root=os.curdir, version=version)
    # Project-level properties override the global defaults (note: this assigns the
    # caller's global_props dict directly and then mutates it in place).
    project.properties = global_props
    project.properties.update(project_props)
    for job_name, job_definition in jobs.items():
        project.add_job(job_name, Job(job_definition))
    for file, target in files:
        project.add_file(file, target)
    return project
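# A minimal sketch of how build_project might be invoked; every value below is
# illustrative and not taken from the source module.
workflow = build_project(
    project_name='sample_workflow',
    global_props={'user.to.proxy': 'etl_user'},
    project_props={'retries': 2},
    jobs={
        'extract': {'type': 'command', 'command': 'echo "extract"'},
        'load': {'type': 'command',
                 'command': 'echo "load"',
                 'dependencies': 'extract'},
    },
    files=[('./helper.sh', 'helper.sh')],
    version='1.0.0',
)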
#!/usr/bin/env python
# encoding: utf-8

"""Simple Azkaban project configuration script."""

from azkaban import Job, Project

project = Project('foo')
project.add_job('bar', Job({'type': 'command', 'command': 'echo "hi!"'}))

if __name__ == '__main__':
    project.main()
    'basic_flow': Job({
        'type': 'noop',
        'dependencies': 'basic_step_5.cmd,basic_step_6.cmd,basic_step_7.cmd,basic_step_8.cmd',
    }),

    # `template_flow` example
    # • Demonstrates using one flow as a "template" that is embedded in another flow and reused multiple times.
    # • The only work performed by each job in this example template is to echo the variables it receives to the log.
    # NOTE: We have to `chmod 777` our script to make sure Azkaban can run it.
    '_template_chmod.cmd': Job({'type': 'command', 'command': 'chmod 777 _echo.sh'}),
    '_template_echo_1.cmd': Job({
        'type': 'command',
        'command': './_echo.sh "echo_1" ${project_1} ${custom_1} ${custom_2}',
        'dependencies': '_template_chmod.cmd',
    }),
    '_template_echo_2.cmd': Job({
        'type': 'command',
        'command': './_echo.sh "echo_2" ${project_1} ${custom_1} ${custom_2}',
        'dependencies': '_template_echo_1.cmd',
    }),
    '_template': Job({'type': 'noop', 'dependencies': '_template_echo_2.cmd'}),

    # • Each of the following subflows embeds *ALL* of the steps from `_template` using the `flow.name` key.
    # • Each defines `custom_1` and `custom_2` keys, which are passed as variables ${custom_1} and ${custom_2}
    #   to `_template` during execution. (A loop-based equivalent is sketched after this snippet.)
    'start.noop': Job({'type': 'noop'}),
    'subflow_1.flow': Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop',
                           'custom_1': 'subflow1-val1', 'custom_2': 'subflow1-val2'}),
    'subflow_2.flow': Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop',
                           'custom_1': 'subflow2-val1', 'custom_2': 'subflow2-val2'}),
    'subflow_3.flow': Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop',
                           'custom_1': 'subflow3-val1', 'custom_2': 'subflow3-val2'}),
    'subflow_4.flow': Job({'type': 'flow', 'flow.name': '_template', 'dependencies': 'start.noop',
                           'custom_1': 'subflow4-val1', 'custom_2': 'subflow4-val2'}),
    'workflow': Job({'type': 'noop',
                     'dependencies': 'subflow_1.flow,subflow_2.flow,subflow_3.flow,subflow_4.flow'}),
}

for name, job in JOBS.items():
    PROJECT.add_job(name, job)

# The CLI requires any non-job files to be explicitly included.
# The project's `root` must be declared for this to work.
FILES = {'./_echo.sh': '_echo.sh'}
for file, name in FILES.items():
    PROJECT.add_file(file, name)
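# Sketch only (not part of the original example): the four subflow entries above differ
# solely in their names and custom_* values, so they could be generated rather than
# written out by hand. The helper below is hypothetical.
from azkaban import Job


def template_subflows(count=4, template='_template', start='start.noop'):
    """Return {job_name: Job} entries that each embed the template flow."""
    return {
        'subflow_%d.flow' % i: Job({
            'type': 'flow',
            'flow.name': template,
            'dependencies': start,
            'custom_1': 'subflow%d-val1' % i,
            'custom_2': 'subflow%d-val2' % i,
        })
        for i in range(1, count + 1)
    }

# e.g. JOBS.update(template_subflows()) before the add_job loop above.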
from getpass import getuser

from azkaban import Job, Project

PROJECT = Project('azkabancli_sample', root=__file__)
PROJECT.properties = {
    'user.to.proxy': 'production_user',
    'hdfs.root': '/jobs/sample/',
}

# dictionary of jobs, keyed by job name
JOBS = {
    'gather_data': Job({
        'type': 'hadoopJava',
        'job.class': 'sample.GatherData',
        'path.output': '${hdfs.root}data.avro',  # note the property use here
    }),
    # ...
}
for name, job in JOBS.items():
    PROJECT.add_job(name, job)

# Test project
# ------------
#
# This project is an exact copy of the production project, which can be used
# to debug / test new features independently of the production flows.
TEST_PROJECT = Project('sample_test', root=__file__)
TEST_PROJECT.properties = {'user.to.proxy': getuser(), 'hdfs.root': 'sample/'}
PROJECT.merge_into(TEST_PROJECT)
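# Continuation sketch (not in the source): after merge_into, TEST_PROJECT holds copies
# of the production jobs but keeps its own properties (the getuser() proxy and the
# relative hdfs.root), so test-only jobs can be added without touching production.
# The job name and command below are purely illustrative.
TEST_PROJECT.add_job(
    'check_output',
    Job({
        'type': 'command',
        'command': 'hdfs dfs -ls ${hdfs.root}data.avro',
    }),
)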
from azkaban import Job, Project

project = Project('foo')
project.add_file('./jobs.py', 'jobs.py')
project.add_job('bar', Job({'type': 'command', 'command': 'cat jobs.py'}))
from getpass import getuser

from azkaban import Job, PigJob, Project

project = Project('foo', root=__file__)

defaults = {
    'user.to.proxy': getuser(),
    'mapred': {
        'max.split.size': 2684354560,
        'min.split.size': 2684354560,
    },
}

project.add_job(
    'first_pig_script',
    PigJob(
        'path/to/first_script.pig',  # assume it exists
        defaults,
    )
)

project.add_job(
    'second_pig_script',
    PigJob(
        'path/to/second_script.pig',  # assume it also exists
        defaults,
        {'mapred.job.queue.name': 'special'},
    )
)

project.add_job(
    'final_job',
    # assumed completion: a final no-op job gating on both Pig scripts
    Job({'type': 'noop', 'dependencies': 'first_pig_script,second_pig_script'}),
)
from getpass import getuser

from azkaban import PigJob, Project

PROJECT = Project('azkabancli_sample', root=__file__)

# default options for all jobs
DEFAULTS = {
    'user.to.proxy': getuser(),
    'param': {
        'input_root': 'sample_dir/',
        'n_reducers': 20,
    },
    'jvm.args.mapred': {
        'max.split.size': 2684354560,
        'min.split.size': 2684354560,
    },
}

# list of pig job options
OPTIONS = [
    {'pig.script': 'first.pig'},
    {'pig.script': 'second.pig', 'dependencies': 'first.pig'},
    {'pig.script': 'third.pig', 'param': {'foo': 48}},
    {'pig.script': 'fourth.pig', 'dependencies': 'second.pig,third.pig'},
]

for option in OPTIONS:
    PROJECT.add_job(option['pig.script'], PigJob(DEFAULTS, option))
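# As in the quickstart snippet earlier, the project can be exposed to the azkaban CLI
# with a main hook (assumption: this file is the script the CLI is pointed at).
if __name__ == '__main__':
    PROJECT.main()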