def test_no_duplicated_jobs_after_file_query_direct(db, tmpdir):
    if not db.startswith('mysql'):
        db = os.path.join(str(tmpdir), db)
    jip.db.init(db)
    p = jip.Pipeline()
    a = p.bash('ls ${input}', input='A.txt', output='out.dat')
    p.context(locals())
    jobs = jip.create_jobs(p, validate=False)
    jip.db.save(jobs)
    assert len(list(jip.db.get_all())) == 1
    assert jobs[0].id == 1

    # second pipeline
    p = jip.Pipeline()
    a = p.bash('ls ${input}', input='A.txt', output='out.dat')
    b = p.bash('ls ${input}', input=a)
    p.context(locals())
    jobs = jip.create_jobs(p, validate=False)
    # search for the out.dat job
    existing = jip.db.query_by_files(
        outputs=jobs[1].tool.input.value
    )
    assert len(list(existing)) == 1
    old = list(existing)[0]
    # now replace the dependency
    jobs[1].dependencies = [old]
    # save only job 1
    jip.db.save(jobs[1])
    # we should have 2 jobs in the database
    assert len(list(jip.db.get_all())) == 2
    # and the one we skipped has no ID
    assert jobs[0].id is None
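# A minimal sketch (not from the test suite) of the deduplication pattern
# the test above exercises: before saving a freshly created job, look up a
# stored job that already produces the files this job consumes and depend
# on it instead of saving a duplicate. The helper name is illustrative and
# it assumes jip.db.init() has already been called.
import jip

def save_with_reuse(job):
    # find stored jobs whose outputs match this job's input files
    existing = list(jip.db.query_by_files(outputs=job.tool.input.value))
    if existing:
        # reuse the stored producer instead of saving a new one
        job.dependencies = existing
    jip.db.save(job)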
def testPipelineStructure(self):
    # load the pipeline
    tool = jip.find("examples/bwa/pileup.jip")
    assert tool is not None
    # create a pipeline
    p = jip.Pipeline()
    # create a new pipeline node and configure it
    p.run(tool, input="setup.py", reference="Makefile", output="out.txt")
    # expand the pipeline so the internal pipeline is resolved
    p.expand(validate=False)
    # after expansion with this setup, the pipeline should have 7 nodes
    assert len(p) == 7
    # and the graph should consist of 6 edges
    assert len(p.edges) == 6
    # fetch the nodes by the names assigned after expansion
    ref = p.get("ref")
    align = p.get("align")
    sam = p.get("sam")
    bam = p.get("bam")
    dups = p.get("dups")
    index = p.get("index")
    pileup = p.get("pileup")
    # check the connections
    assert not ref.has_incoming()
    assert align.has_incoming(ref)
    assert sam.has_incoming(align)
    assert bam.has_incoming(sam)
    assert dups.has_incoming(bam)
    assert index.has_incoming(dups)
    assert pileup.has_incoming(index)
    assert not pileup.has_outgoing()
def test_embedded_pipelines():
    @jip.tool()
    class produce():
        """Produce a set of files

        Usage:
            produce --prefix <prefix> --number <number>
        """
        def init(self):
            self.add_output('output', '${prefix}.*', nargs="*")

        def get_command(self):
            return """
            for x in $(seq ${number}); do
                echo Hello $x > ${prefix}.$x;
            done
            """

    @jip.tool()
    def consume():
        """Count something

        Usage:
            consume <input>
        """
        return """cat ${input}"""

    p = jip.Pipeline()
    # produce n files
    producer = p.run('produce', prefix='test', number=5)
    # run after success dynamically
    producer.on_success('consume', input=producer)
    jobs = jip.create_jobs(p)
    assert len(jobs) == 1
    assert len(jobs[0].on_success) == 1
def test_gem_pipeline_with_output_dir():
    p = jip.Pipeline()
    p.run('grape_gem_rnapipeline', fastq='reads_1.fastq.gz',
          genome='index.fa', annotation='gencode.gtf',
          output_dir="mydir", max_matches='10', max_mismatches='4')
    jobs = jip.create_jobs(p, validate=False)
    ldir = os.getcwd()
    j = os.path.join
    assert len(jobs) == 4
    assert jobs[2].configuration['index'].get() == j(ldir, 'index.gem')
    assert jobs[2].configuration['fastq'].get() == \
        j(ldir, 'reads_1.fastq.gz')
    assert jobs[2].configuration['transcript_index'].get() == \
        j(ldir, 'gencode.gtf.gem')
    assert jobs[2].configuration['quality'].get() == '33'
    assert jobs[2].configuration['output_dir'].get() == "mydir"
    assert jobs[2].configuration['name'].get() == 'reads'
    assert jobs[2].configuration['bam'].get() == j(ldir, 'mydir/reads.bam')
    assert jobs[2].configuration['bai'].get() == \
        j(ldir, 'mydir/reads.bam.bai')
    assert jobs[2].configuration['map'].get() == \
        j(ldir, 'mydir/reads.map.gz')
    assert jobs[3].configuration['input'].get() == \
        j(ldir, 'mydir/reads.bam')
    assert jobs[3].configuration['name'].get() == 'reads'
    assert jobs[3].configuration['annotation'].get() == \
        j(ldir, 'gencode.gtf')
    assert jobs[3].configuration['output_dir'].get() == "mydir"
    assert jobs[3].configuration['output'].get() == j(ldir, 'mydir/reads.gtf')
    assert len(jobs[0].children) == 2
    assert len(jobs[1].dependencies) == 1
    assert len(jobs[2].dependencies) == 2
    assert len(jobs[3].dependencies) == 1
    assert jobs[0].children[0] == jobs[1]
def test_nested_pipes_stream_setup_stream_multiplex():
    tool = jip.find('joined_pipeline')
    assert tool is not None
    p = jip.Pipeline()
    p.run(tool, input=["Makefile", "README.rst", "setup.py"],
          output="${input}.dat")
    p.expand(validate=False)
    # after multiplexing over the three inputs: 6 nodes and 3 edges
    assert len(p) == 6
    assert len(p.edges) == 3
    t1_0 = p.get("TestJob1.0")
    t2_0 = p.get("TestJob2.0")
    assert t1_0.has_outgoing(t2_0, ('output', 'input'), True)
    t1_1 = p.get("TestJob1.1")
    t2_1 = p.get("TestJob2.1")
    assert t1_1.has_outgoing(t2_1, ('output', 'input'), True)
    t1_2 = p.get("TestJob1.2")
    t2_2 = p.get("TestJob2.2")
    assert t1_2.has_outgoing(t2_2, ('output', 'input'), True)
    # test option values
    cwd = os.getcwd()
    join = os.path.join
    assert t1_0.input == join(cwd, 'Makefile')
    assert t1_1.input == join(cwd, 'README.rst')
    assert t1_2.input == join(cwd, 'setup.py')
    assert t2_0.output == join(cwd, 'Makefile.dat')
    assert t2_1.output == join(cwd, 'README.rst.dat')
    assert t2_2.output == join(cwd, 'setup.py.dat')
def test_single_job_fail(tmpdir):
    tmpdir = str(tmpdir)
    target_file = os.path.join(tmpdir, 'result.txt')
    db_file = os.path.join(tmpdir, "test.db")
    assert not os.path.exists(target_file)
    # create a JIP database and a session
    jip.db.init(db_file)
    # create the cluster instance
    c = cl.LocalCluster()
    # create the pipeline
    p = jip.Pipeline()
    p.job(dir=tmpdir).bash('touch ${input}; exit 1;', outfile=target_file)
    p.context(locals())
    # create the jobs
    jobs = jip.create_jobs(p)
    # iterate the executions with save=True so all jobs are stored
    for e in jip.create_executions(jobs, save=True):
        jip.submit_job(e.job, cluster=c)
    c.wait()
    # the job failed, so the target file must not exist
    assert not os.path.exists(target_file)
    # but we should still have the log files
    assert os.path.exists(os.path.join(tmpdir, "jip-1.out"))
    assert os.path.exists(os.path.join(tmpdir, "jip-1.err"))
    # and we should have one job in Failed state in our database;
    # we do the query with a fresh session though
    job = jip.db.get(1)
    assert job is not None
    assert job.state == jip.db.STATE_FAILED
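# A sketch of the submit-and-wait flow used above. The cluster argument is
# whatever the caller created (a LocalCluster instance in the test);
# create_executions(save=True) persists the jobs before they are handed to
# the cluster. The helper name is illustrative.
import jip

def submit_all(jobs, cluster):
    for e in jip.create_executions(jobs, save=True):
        jip.submit_job(e.job, cluster=cluster)
    # block until the cluster has processed everything
    cluster.wait()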
def test_setting_working_directory_to_sub():
    cwd = os.getcwd()
    p = jip.Pipeline()
    # produce n files
    p.job(dir="sub").run('produce', prefix='test', number=5)
    jobs = jip.create_jobs(p)
    assert jobs[0].working_directory == cwd + "/sub"
def test_job_hierarchy_execution_with_pipes_no_dispatching(tmpdir):
    tmpdir = str(tmpdir)
    target_file = os.path.join(tmpdir, 'result')
    # create the pipeline
    p = jip.Pipeline()
    a = p.job(dir=tmpdir).bash('echo "hello world"')
    b = p.job(dir=tmpdir).bash('wc -w', output=target_file)
    a | b
    p.context(locals())
    # create the jobs
    jobs = jip.create_jobs(p)
    assert len(jobs[0].pipe_to) == 1
    assert len(jobs) == 2
    # iterate the executions and run each job; the piped jobs collapse
    # into a single execution
    execs = 0
    for e in jip.create_executions(jobs):
        jip.run_job(e.job)
        execs += 1
    assert execs == 1
    # now the file should be there
    assert os.path.exists(target_file)
    for j in jobs:
        assert j.state == jip.db.STATE_DONE
    # check the content of the output files
    assert open(target_file).read().strip() == "2"
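# A minimal sketch of the streaming pipe shown above: the '|' operator
# connects a's stdout to b's stdin, so both nodes collapse into a single
# execution with no intermediate file. The commands and the output name
# are illustrative.
import jip

def pipe_sketch(outfile):
    p = jip.Pipeline()
    a = p.bash('echo "hello world"')
    b = p.bash('wc -w', output=outfile)
    a | b  # stream directly instead of writing a's output to disk
    return jip.create_jobs(p)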
def pipeline(self):
    p = jip.Pipeline()
    gem_setup = p.run('grape_gem_setup', input=self.genome)
    gem = p.run('grape_gem_rnatool', index=gem_setup.index,
                fastq=self.fastq)
    p.context(locals())
    return p
def test_job_hierarchy_job_group(tmpdir):
    tmpdir = str(tmpdir)
    target_file = os.path.join(tmpdir, 'result')

    @jip.tool()
    def merge():
        """\
        Merge

        usage:
            merge --input <input>... [--output <output>]

        Options:
            --input <input>...    The input
                                  [default: stdin]
            --output <output>     The output
                                  [default: stdout]
        """
        return "cat ${input|else('-')} ${output|arg('> ')}"

    # create the pipeline
    p = jip.Pipeline()
    a_1 = p.job(dir=tmpdir).bash('echo "hello spain"',
                                 output=target_file + ".1")
    a_2 = p.job(dir=tmpdir).bash('echo "hello world"',
                                 output=target_file + ".2")
    a_3 = p.job(dir=tmpdir).bash('echo "hello universe"',
                                 output=target_file + ".3")
    b = p.job(dir=tmpdir).run('merge', output=target_file)
    b.input = [a_1, a_2, a_3]
    (a_1 - a_2 - a_3 - b)
    p.context(locals())
    # create the jobs
    jobs = jip.create_jobs(p)
    assert len(jobs) == 4
    assert len(jobs[0].dependencies) == 0
    assert len(jobs[0].children) == 2
    assert len(jobs[1].dependencies) == 1
    assert len(jobs[1].children) == 2
    assert len(jobs[2].dependencies) == 1
    assert len(jobs[2].children) == 1
    assert len(jobs[3].dependencies) == 3
    print(jobs[3].command)
    # iterate the executions and run each job; the grouped jobs run
    # as a single execution
    execs = 0
    for e in jip.create_executions(jobs):
        jip.run_job(e.job)
        execs += 1
    assert execs == 1
    # now the files should be there
    for j in jobs:
        assert j.state == jip.db.STATE_DONE
    # check the content of the output files
    assert open(target_file + '.1').read().strip() == "hello spain"
    assert open(target_file + '.2').read().strip() == "hello world"
    assert open(target_file + '.3').read().strip() == "hello universe"
    assert open(target_file).read().strip() == "hello spain\n"\
        "hello world\nhello universe"
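# A sketch of the job-group operator used above: chaining nodes with '-'
# keeps their sequential dependencies but groups them into one execution
# (the test asserts execs == 1 for four jobs). Commands and file names are
# illustrative.
import jip

def group_sketch(outdir, outfile):
    p = jip.Pipeline()
    a = p.job(dir=outdir).bash('echo one', output=outfile + '.1')
    b = p.job(dir=outdir).bash('echo two', output=outfile + '.2')
    a - b  # group: a and b run sequentially within a single execution
    return jip.create_jobs(p)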
def test_dynamic_options():
    script = '''#!/usr/bin/env jip
# Touch a number of files with a common prefix
#
# usage:
#     touch --prefix <prefix> --count <count>

#%begin init
add_output('output')
#%end

#%begin setup
options['output'].set(["%s_%s" % (prefix, i)
                       for i in range(1, count.get(int) + 1)])
#%end

#%begin command
for x in ${output}; do
    touch $x
done
'''
    tool = jip.tools.ScriptTool.from_string(script)
    tool.init()
    assert tool is not None
    p = jip.Pipeline()
    node = p.job('test').run(tool, prefix='test', count=5)
    assert node is not None
    p.expand()
    assert len(p) == 1
    node = p.get('test')
    assert node.prefix == 'test'
    cwd = os.getcwd()
    assert node.output == [
        os.path.join(cwd, x)
        for x in ['test_1', 'test_2', 'test_3', 'test_4', 'test_5']
    ]
def jip_prepare(args, submit=False, project=None, datasets=[], validate=True):
    # get the project and the selected datasets
    if not project and not datasets:
        project, datasets = get_project_and_datasets(args)
    # setup jip db
    jip.db.init(project.jip_db)
    p = jip.Pipeline()
    jargs = {}
    if datasets == ['setup']:
        jargs['input'] = project.config.get('genome')
        jargs['annotation'] = project.config.get('annotation')
        p.run('grape_gem_setup', **jargs)
        jobs = jip.jobs.create_jobs(p)
    else:
        input = []
        for d in datasets:
            fqs = d.fastq.keys()
            fqs.sort()
            input.append(fqs[0])
            if len(fqs) == 1:
                jargs['single-end'] = True
        jargs['fastq'] = input
        jargs['annotation'] = project.config.get('annotation')
        jargs['genome'] = project.config.get('genome')
        jargs['max_mismatches'] = args.max_mismatches
        jargs['max_matches'] = args.max_matches
        jargs['threads'] = args.threads
        p.run('grape_gem_rnapipeline', **jargs)
        jobs = jip.jobs.create_jobs(p, validate=validate)
    if submit:
        jobs = check_jobs_dependencies(jobs)
    return jobs
def test_tool_name_with_local_context():
    p = jip.Pipeline()
    a = p.run('foo', input='Makefile')
    p.context(locals())
    jobs = jip.create_jobs(p, validate=False)
    assert len(jobs) == 1
    assert jobs[0].name == 'Makefile'
def test_tool_name_in_pipeline_context():
    @jip.tool()
    class MyTool():
        def validate(self):
            self.job.name = "testtool"

        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def validate(self):
            self.name("thepipeline")

        def pipeline(self):
            p = jip.Pipeline()
            p.run('MyTool')
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    jobs = jip.create_jobs(p)
    assert len(jobs) == 1
    assert jobs[0].name == "testtool"
    assert jobs[0].pipeline == "thepipeline"
def pipeline(self):
    p = jip.Pipeline()
    index = p.run('grape_gem_index', input=self.input, output=self.index)
    p.context(locals())
    return p
def test_pipeline_tool_defaults_global_job():
    @jip.tool()
    class MyTool():
        def setup(self):
            self.profile.threads = 2
            self.profile.queue = "Org"

        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def pipeline(self):
            p = jip.Pipeline()
            p.job(threads=3, queue="Intern").run('MyTool')
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    profile = jip.Profile(threads=5, queue="yeah")
    profile.specs['MyTool'] = jip.Profile()
    profile.apply_to_pipeline(p)
    jobs = jip.create_jobs(p)
    assert jobs[0].threads == 3
    assert jobs[0].queue == "Intern"
def test_pipeline_tool_spec_regexp():
    @jip.tool()
    class MyTool():
        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def pipeline(self):
            p = jip.Pipeline()
            p.job(threads=3, queue="Intern").run('MyTool')
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    profile = jip.Profile(threads=5, queue="yeah", priority="high")
    profile.specs['My*'] = jip.Profile(threads=10, queue="rock")
    profile.apply_to_pipeline(p)
    jobs = jip.create_jobs(p)
    assert jobs[0].threads == 10
    assert jobs[0].queue == "rock"
    assert jobs[0].priority == "high"
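# A sketch of the profile precedence the surrounding tests pin down: spec
# entries (keyed by a tool name or a glob such as 'My*') override the
# global profile for matching tools, while fields a spec leaves unset fall
# back to the global profile (the 'priority' assertion above). Queue names
# and thread counts here are illustrative.
import jip

def build_profile():
    profile = jip.Profile(threads=4, queue="default")
    # every tool whose name matches 'align*' gets its own settings
    profile.specs['align*'] = jip.Profile(threads=16, queue="highmem")
    return profile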
def test_pipeline_overwrites_pipeline_from_spec():
    @jip.tool()
    class MyTool():
        def setup(self):
            self.profile.threads = 2
            self.profile.queue = "Org"

        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def pipeline(self):
            p = jip.Pipeline()
            p.job(threads=3, queue="Yeah").run('MyTool')
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    profile = jip.Profile(threads=10, queue="Test")
    profile.specs['MyTool'] = jip.Profile(threads=5)
    profile.apply_to_pipeline(p)
    jobs = jip.create_jobs(p)
    assert jobs[0].threads == 5
    assert jobs[0].queue == "Yeah"
def test_tool_name_in_pipelines_with_multiplexing_and_custom_template_name():
    @jip.tool()
    class MyTool():
        """mytool

        usage:
            mytool <data>
        """
        def validate(self):
            self.job.name = "${data}"

        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def validate(self):
            self.name("thepipeline")

        def pipeline(self):
            p = jip.Pipeline()
            p.run('MyTool', data=["A", "B"])
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    profile = jip.Profile(name="customname")
    jobs = jip.create_jobs(p, profile=profile)
    assert len(jobs) == 2
    assert jobs[0].name == "A"
    assert jobs[0].pipeline == "customname"
    assert jobs[1].name == "B"
    assert jobs[1].pipeline == "customname"
def test_tool_name_in_pipeline_context_with_custom_profile_and_custom_name():
    @jip.tool()
    class MyTool():
        def validate(self):
            self.job.name = "testtool"

        def get_command(self):
            return "echo"

    @jip.pipeline()
    class MyPipeline():
        def validate(self):
            self.name("thepipeline")

        def pipeline(self):
            p = jip.Pipeline()
            p.job('Tool1').run('MyTool')
            return p

    p = jip.Pipeline()
    p.run('MyPipeline')
    p.expand()
    profile = jip.Profile(name="customname")
    jobs = jip.create_jobs(p, profile=profile)
    assert len(jobs) == 1
    assert jobs[0].name == "Tool1"
    assert jobs[0].pipeline == "customname"
def test_job_hierarchy_execution_with_dispatching_fan_out(tmpdir):
    tmpdir = str(tmpdir)
    target_file = os.path.join(tmpdir, 'result')
    # create the pipeline
    p = jip.Pipeline()
    a = p.job(dir=tmpdir).bash('echo "hello world"',
                               output=target_file + ".1")
    b = p.job(dir=tmpdir).bash('wc -w', output=target_file + ".2")
    c = p.job(dir=tmpdir).bash('wc -l', output=target_file + ".3")
    a | (b + c)
    p.context(locals())
    # create the jobs
    jobs = jip.create_jobs(p)
    # iterate the executions and run each job; the fan-out runs as a
    # single dispatched execution
    execs = 0
    for e in jip.create_executions(jobs):
        jip.run_job(e.job)
        execs += 1
    assert execs == 1
    # now all jobs should be done
    for j in jobs:
        assert j.state == jip.db.STATE_DONE
    # check the content of the output files
    assert open(target_file + '.1').read().strip() == "hello world"
    assert open(target_file + '.3').read().strip() == "1"
    assert open(target_file + '.2').read().strip() == "2"
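# A minimal sketch of the fan-out dispatch above: 'a | (b + c)' duplicates
# a's stdout stream so both b and c consume it, while a still writes its
# own output file; all three jobs run as one execution. Directory and file
# names are illustrative.
import jip

def fan_out_sketch(outdir, prefix):
    p = jip.Pipeline()
    a = p.job(dir=outdir).bash('echo "hello world"', output=prefix + '.1')
    b = p.job(dir=outdir).bash('wc -w', output=prefix + '.2')
    c = p.job(dir=outdir).bash('wc -l', output=prefix + '.3')
    a | (b + c)  # stream a's output to both consumers at once
    return jip.create_jobs(p)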
def test_gem_name_option_delegation_with_output_dir():
    p = jip.Pipeline()
    p.run('grape_gem_rnapipeline', fastq='reads_1.fastq.gz',
          index='index.gem', annotation='gencode.gtf', output_dir="mydir")
    jobs = jip.create_jobs(p, validate=False)
    ldir = os.getcwd()
    j = os.path.join
    assert len(jobs) == 2
    assert jobs[0].configuration['index'].get() == j(ldir, 'index.gem')
    assert jobs[0].configuration['fastq'].get() == \
        j(ldir, 'reads_1.fastq.gz')
    assert jobs[0].configuration['annotation'].get() == \
        j(ldir, 'gencode.gtf')
    assert jobs[0].configuration['quality'].get() == '33'
    assert jobs[0].configuration['output_dir'].get() == "mydir"
    assert jobs[0].configuration['name'].get() == 'reads'
    assert jobs[0].configuration['bam'].get() == j(ldir, 'mydir/reads.bam')
    assert jobs[0].configuration['bai'].get() == \
        j(ldir, 'mydir/reads.bam.bai')
    assert jobs[0].configuration['map'].get() == \
        j(ldir, 'mydir/reads.map.gz')
    assert jobs[1].configuration['input'].get() == \
        j(ldir, 'mydir/reads.bam')
    assert jobs[1].configuration['name'].get() == 'reads'
    assert jobs[1].configuration['annotation'].get() == \
        j(ldir, 'gencode.gtf')
    assert jobs[1].configuration['output_dir'].get() == "mydir"
    assert jobs[1].configuration['gtf'].get() == j(ldir, 'mydir/reads.gtf')
    assert len(jobs[0].children) == 1
    assert len(jobs[1].dependencies) == 1
    assert jobs[0].children[0] == jobs[1]
def test_setting_working_directory_cwd_with_profile():
    cwd = os.getcwd()
    p = jip.Pipeline()
    # produce n files
    p.run('produce', prefix='test', number=5)
    profile = jip.profiles.Profile()
    jobs = jip.create_jobs(p, profile=profile)
    assert jobs[0].working_directory == cwd
def test_depends_on_with_multiple_nodes():
    p = jip.Pipeline()
    a = p.bash('hostname')
    b = p.bash('hostname')
    c = p.bash('hostname')
    a.depends_on(c, b)
    assert len(list(a.incoming())) == 2
def pipeline(self):
    p = jip.Pipeline()
    p.name("Test2")
    p.job("TestJob2").run('bash', cmd='cat ${input|else("-")}',
                          input=self.options['input'],
                          output=self.options['output'])
    return p
def test_hello_world_py_cls(tmpdir):
    tmpdir = str(tmpdir)
    jip.scanner.add_module('examples/hello_world/hello_world.py')
    jip.scanner.scan_modules()
    p = jip.Pipeline()
    p.job(dir=tmpdir).run('cls_hello_world_py')
    jobs = jip.create_jobs(p)
    assert len(jobs) == 1
def test_pipeline_with_local_context_in_expand():
    p = jip.Pipeline()
    a = "Makefile"
    p.job().bash("wc -l ${a}")
    p.expand(locals())
    b = p.get('bash')
    assert b is not None
    assert b.cmd.get() == 'wc -l Makefile'
def embedded():
    """Produce and consume"""
    p = jip.Pipeline()
    # produce n files
    producer = p.run('produce', prefix='test', number=5)
    # run after success dynamically
    producer.on_success('consume', input=producer)
    return p
def test_embedded_options_are_absolute():
    jip.scanner.add_folder("test/data/makeabs")
    p = jip.Pipeline()
    p.run('makeabs', infile="Makefile", output="result")
    jobs = jip.create_jobs(p)
    assert len(jobs) == 1
    cwd = os.getcwd()
    assert jobs[0].command == "(cat %s/Makefile)> %s/result" % (cwd, cwd)
def subedge_pipe_2(tool):
    """Subedge

    usage:
        subedge --input <input> --output <output>
    """
    p = jip.Pipeline()
    p.job('p2').bash('touch', input=tool.input, output=tool.output)
    return p