def test_get_assets(self):
    # Register a concrete step with three source assets, then resurrect
    # every stored source asset. There is no explicit assertion: the test
    # passes as long as none of these calls raises.

    # getting away with an empty model for this step
    empty_model = core.Model(tuple())
    cache = self.cls_to_test(self.cache_file.name, empty_model,
                             force_create=True)

    class Activities(core.Enum):
        DATETIME = 'Give date/time'

    step = self.PythonTime('python')
    variant_row = cache.id_step_variant(step, (Activities.DATETIME, ))

    # Source class with three attributes: two FASTA files and one
    # sequence of CSV files.
    attributes = [
        core.AssetAttr('reference', rnaseq.FASTAFile, ''),
        core.AssetAttr('otherreference', rnaseq.FASTAFile, ''),
        core.AssetAttr('listoffiles', rnaseq.CSVFileSequence, ''),
    ]
    SrcCls = core.assetfactory('Source', attributes)
    foo_fasta = rnaseq.FASTAFile('foo.fasta')
    bar_fasta = rnaseq.FASTAFile('bar.fasta')
    csv_files = (rnaseq.CSVFile('baz.csv'), rnaseq.CSVFile('baz2.csv'))
    sources = SrcCls(foo_fasta, bar_fasta, rnaseq.CSVFileSequence(csv_files))

    # no targets and no parameters for this step
    targets = core.AssetSet()
    parameters = tuple()
    concrete_row = cache.id_stepconcrete(variant_row.id,
                                         sources, targets, parameters)

    # Each stored source asset must be resurrectable from the rnaseq model.
    for stored in cache.get_srcassets(concrete_row.id):
        stored.resurrect(rnaseq)
def test_RecipeSimpleIncremental(self):
    # Build the bowtie2 index once, then re-open the same working directory
    # as a second project and check that adding the identical task there
    # finds it already done, without creating another concrete step.

    # Fixture attributes (several are not used below; the lookups are kept
    # so the method touches the same fixture state as before).
    project = self.project
    env = self.env
    nsamples = self.nsamples
    samplereads = self.samplereads
    sampleinfo_fh = self.sampleinfo_fh
    reference_fn = self.reference_fn
    referenceannotation = self.referenceannotation
    PHAGEFASTA = self._PHAGEFASTA
    PHAGEGFF = self._PHAGEGFF

    # steps used
    activities = env.activities
    bowtie2index = activities.INDEX.bowtie2build
    bowtie2align = activities.ALIGN.bowtie2
    htseqcount = activities.QUANTIFY.htseqcount
    merge = activities.UTILITY.columnmerger
    edger = activities.DIFFEXP.edger

    from railroadtracks import easy

    # sequence of tasks to run
    torun = []

    # index for alignment
    IndexAssets = bowtie2index.Assets
    index_source = IndexAssets.Source(rnaseq.FASTAFile(reference_fn))
    index_assets = IndexAssets(index_source,
                               IndexAssets.Target.createundefined())
    task_index = project.add_task(bowtie2index, index_assets)
    # the step is not done
    self.assertEqual(hortator._TASK_TODO, task_index.info[1])
    torun.append(task_index)

    # run the tasks, skipping the ones already done
    for task in torun:
        if task.info[1] == hortator._TASK_DONE:
            continue
        task.execute()
        task.status = hortator._TASK_DONE

    self.assertEqual(1, project.persistent_graph.nconcrete_steps)

    # now that the tasks have run let's open the same project
    project_same = easy.Project(project.model, wd=project.wd)

    # same index task, added to the re-opened project
    IndexAssets = bowtie2index.Assets
    index_source = IndexAssets.Source(rnaseq.FASTAFile(reference_fn))
    index_assets = IndexAssets(index_source,
                               IndexAssets.Target.createundefined())
    task_index_same = project_same.add_task(bowtie2index, index_assets)

    # Distinct task and asset objects...
    self.assertNotEqual(task_index, task_index_same)
    self.assertNotEqual(task_index.call.assets,
                        task_index_same.call.assets)
    # ...with the same source and target content...
    first_assets = task_index.call.assets
    second_assets = task_index_same.call.assets
    self.assertListEqual(list(first_assets.source.reference),
                         list(second_assets.source.reference))
    self.assertListEqual(list(first_assets.target.indexfilepattern),
                         list(second_assets.target.indexfilepattern))
    # ...already marked as done, and no additional concrete step.
    self.assertEqual(hortator._TASK_DONE, task_index_same.info[1])
    self.assertEqual(1, project.persistent_graph.nconcrete_steps)
def setUp(self):
    """Create a project holding one bowtie2 index task, plus a spare directory.

    Sets ``self.project`` (a project in a fresh temporary directory),
    ``self.task`` (the index task added to it) and ``self.wd2`` (a second
    temporary directory, for tests that need another project).
    """
    # Local import so this method does not depend on a module-level import.
    import shutil

    # Both temporary directories are removed when the test finishes; the
    # cleanups are registered immediately so that a failure later in this
    # method does not leave them behind. ignore_errors keeps the cleanup
    # harmless if the directory has already been removed elsewhere.
    wd = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, wd, ignore_errors=True)
    self.wd2 = tempfile.mkdtemp()
    self.addCleanup(shutil.rmtree, self.wd2, ignore_errors=True)

    self.project = easy.Project(rnaseq, wd)

    # Index-building task for the reference FASTA, with sources only.
    bowtie2index = rnaseq.Bowtie2Build()
    reference_fn = PHAGEFASTA
    Assets = bowtie2index.Assets
    task = self.project.add_task(
        bowtie2index,
        Assets(Assets.Source(rnaseq.FASTAFile(reference_fn))))
    self.task = task
def testAddTaskDifferentProject(self):
    # A TaskSetGraph that already holds a task from one project must refuse
    # (ValueError) a task that belongs to another project.

    # Second project, in its own working directory, with the same
    # index-building task as the one prepared in setUp().
    other_project = easy.Project(rnaseq, self.wd2)
    indexer = rnaseq.Bowtie2Build()
    IndexAssets = indexer.Assets
    reference = rnaseq.FASTAFile(PHAGEFASTA)
    other_task = other_project.add_task(
        indexer, IndexAssets(IndexAssets.Source(reference)))

    graph = tasksetgraph.TaskSetGraph()
    graph.add(self.task)
    with self.assertRaises(ValueError):
        graph.add(other_task)
def test_id_stepconcrete(self):
    # id_stepconcrete() must return the same ID when called again with the
    # same step variant, sources, targets, parameters and tag, and a
    # different ID when one of them differs. The last check expects a
    # ValueError when the same sources come with different targets.

    # getting away with an empty model for this step
    empty_model = core.Model(tuple())
    cache = self.cls_to_test(self.cache_file.name, empty_model,
                             force_create=True)

    class Activities(core.Enum):
        DATETIME = 'Give date/time'

    def concrete_id(variant, sources, targets, parameters, **kwargs):
        # ID of the concrete step as reported by the cache.
        row = cache.id_stepconcrete(variant.id, sources, targets,
                                    parameters, **kwargs)
        return row.id

    time_step = self.PythonTime('python')
    time_variant = cache.id_step_variant(time_step,
                                         (Activities.DATETIME, ))

    # --- empty sources is a special case
    sources = core.AssetSet()
    targets = core.AssetSet()
    parameters = tuple()
    id_empty = concrete_id(time_variant, sources, targets, parameters)
    id_empty_again = concrete_id(time_variant, sources, targets, parameters)
    self.assertEqual(id_empty, id_empty_again)
    # a different tag gives a different concrete step
    id_other_tag = concrete_id(time_variant, sources, targets, parameters,
                               tag=2)
    self.assertNotEqual(id_empty, id_other_tag)

    # --- 1-element sources
    Indexer = railroadtracks.model.aligners.AssetsIndexer
    sources = Indexer.Source(rnaseq.FASTAFile('foo.fasta'))
    id_one_source = concrete_id(time_variant, sources, targets, parameters)
    self.assertNotEqual(id_empty, id_one_source)
    id_one_source_again = concrete_id(time_variant, sources, targets,
                                      parameters)
    self.assertEqual(id_one_source_again, id_one_source)
    # same sources, one parameter
    id_one_param = concrete_id(time_variant, sources, targets, ("%Y", ))
    self.assertNotEqual(id_empty, id_one_param)
    self.assertNotEqual(id_one_source_again, id_one_param)

    # --- 1-element sources, several parameters
    id_two_params = concrete_id(time_variant, sources, targets,
                                ("%Y", "Z"))
    id_two_params_again = concrete_id(time_variant, sources, targets,
                                      ("%Y", "Z"))
    self.assertEqual(id_two_params, id_two_params_again)
    id_two_other_params = concrete_id(time_variant, sources, targets,
                                      ("%Y", "W"))
    self.assertNotEqual(id_two_params, id_two_other_params)

    # --- 2-elements sources
    SrcCls = core.assetfactory('Source', [
        core.AssetAttr('reference', rnaseq.FASTAFile, ''),
        core.AssetAttr('otherreference', rnaseq.FASTAFile, ''),
    ])
    sources = SrcCls(rnaseq.FASTAFile('foo.fasta'),
                     rnaseq.FASTAFile('bar.fasta'))
    id_two_sources = concrete_id(time_variant, sources, targets, parameters)
    self.assertNotEqual(id_empty, id_two_sources)
    id_two_sources_again = concrete_id(time_variant, sources, targets,
                                       parameters)
    self.assertEqual(id_two_sources_again, id_two_sources)

    # --- 1-element source / 1-element target, with another step variant
    sources = Indexer.Source(rnaseq.FASTAFile('foo.fasta'))
    targets = Indexer.Target(rnaseq.FilePattern('foo_idx'))
    script_step = rnaseq.Anyscript()
    script_variant = cache.id_step_variant(script_step,
                                           (Activities.DATETIME, ))
    id_with_target = concrete_id(script_variant, sources, targets,
                                 parameters)
    self.assertNotEqual(id_empty, id_with_target)
    id_with_target_again = concrete_id(script_variant, sources, targets,
                                       parameters)
    self.assertEqual(id_with_target_again, id_with_target)

    # --- fail if target assets are suddenly different
    targets_bar = Indexer.Target(rnaseq.FilePattern('bar_idx'))
    with self.assertRaises(ValueError):
        cache.id_stepconcrete(script_variant.id, sources, targets_bar,
                              parameters)
def test_RecipeLoop(self):
    # Integration test: for several aligner/preset combinations, add the
    # align -> quantify -> merge -> differential-expression tasks to the
    # project and try to run all of them. Failures of individual tasks are
    # recorded in the project rather than failing the test.
    project = self.project
    env = self.env
    nsamples = self.nsamples
    samplereads = self.samplereads
    sampleinfo_fh = self.sampleinfo_fh
    reference_fn = self.reference_fn
    referenceannotation = self.referenceannotation
    PHAGEFASTA = self._PHAGEFASTA
    PHAGEGFF = self._PHAGEGFF

    # -- recipeloop-test-begin
    from railroadtracks import easy

    torun = list()

    # bowtie
    bowtie1index = env.activities.INDEX.bowtiebuild
    bowtie1align = env.activities.ALIGN.bowtie
    Assets = bowtie1index.Assets
    fa_file = rnaseq.FASTAFile(reference_fn)
    task_index_bowtie1 = project.add_task(bowtie1index,
                                          Assets(Assets.Source(fa_file), None))
    torun.append(task_index_bowtie1)

    # bowtie2
    bowtie2index = env.activities.INDEX.bowtie2build
    bowtie2align = env.activities.ALIGN.bowtie2
    Assets = bowtie2index.Assets
    fa_file = rnaseq.FASTAFile(reference_fn)
    task_index_bowtie2 = project.add_task(bowtie2index,
                                          Assets(Assets.Source(fa_file), None))
    torun.append(task_index_bowtie2)

    # STAR
    starindex = env.activities.INDEX.starindex
    staralign = env.activities.ALIGN.staralign
    Assets = starindex.Assets
    fa_file = rnaseq.FASTAFile(reference_fn)
    task_index_star = project.add_task(starindex,
                                       Assets(Assets.Source(fa_file), None))
    torun.append(task_index_star)

    # TopHat2
    # (index from bowtie2 used)
    #tophat2 = env.activities.ALIGN.tophat2

    # featureCount
    featurecount = env.activities.QUANTIFY.featurecount

    # Merge columns (obtained from counting)
    merge = env.activities.UTILITY.columnmerger

    # EdgeR, DESeq, DESeq2, and LIMMA voom
    edger = env.activities.DIFFEXP.edger
    deseq = env.activities.DIFFEXP.deseq
    deseq2 = env.activities.DIFFEXP.deseq2
    voom = env.activities.DIFFEXP.limmavoom

    # Now explore the different alignment presets in bowtie2, and vanilla star
    from collections import namedtuple
    Options = namedtuple('Options', 'aligner assets_index parameters')
    # Try various presets for bowtie2 (all using the same bowtie2 index)
    bowtie2_parameters = (('--very-fast', ), ('--fast', ),
                          ('--sensitive', ), ('--very-sensitive', ))
    options = [Options(bowtie2align,
                       task_index_bowtie2.call.assets.target,
                       preset)
               for preset in bowtie2_parameters]

    # add bowtie
    options.append(Options(bowtie1align,
                           task_index_bowtie1.call.assets.target,
                           tuple()))
    # add STAR (vanilla, no specific options beside the size of index k-mers)
    options.append(Options(staralign,
                           task_index_star.call.assets.target,
                           ('--genomeChrBinNbits', '12')))
    # add TopHat2
    #options.append(Options(tophat2, task_index_bowtie2.call.assets.target, tuple()))

    # loop over the options
    for option in options:
        sample_counts = list()
        # loop over the samples
        for sample_i in range(nsamples):
            read1_fh, read2_fh = samplereads[sample_i]
            # align
            Assets = option.aligner.Assets
            assets = Assets(Assets.Source(option.assets_index.indexfilepattern,
                                          rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
                                          rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name)),
                            Assets.Target.createundefined())
            task_align = project.add_task(option.aligner,
                                          assets,
                                          parameters=option.parameters)
            torun.append(task_align)

            # quantify
            # (non-default parameters to fit our demo GFF)
            Assets = featurecount.Assets
            assets = Assets(Assets.Source(task_align.call.assets.target.alignment,
                                          rnaseq.GFFFile(referenceannotation)),
                            Assets.Target.createundefined())
            task_quantify = project.add_task(featurecount,
                                             assets,
                                             parameters=('--gtf-featuretype', 'CDS',
                                                         '--gtf-attrtype', 'ID'))
            torun.append(task_quantify)

            # keep a pointer to the counts, as we will use it in the merge step
            sample_counts.append(task_quantify.call.assets)

        # merge the sample data into a table (so differential expression can be computed)
        Assets = merge.Assets
        source = Assets.Source(rnaseq.CSVFileSequence(tuple(x.target.counts
                                                            for x in sample_counts)))
        assets_merge = Assets(source,
                              Assets.Target.createundefined())
        task_merge = project.add_task(merge,
                                      assets_merge,
                                      parameters=("0", "1"))
        torun.append(task_merge)

        # differential expression with edgeR, DESeq, DESeq2, and voom;
        # each tool is paired with the parameters it should run with
        for diffexp, params in ((edger, ()),
                                (deseq, ('--dispersion-fittype=local', )),
                                (deseq2, ()),
                                (voom, ())):
            Assets = diffexp.Assets
            assets = Assets(Assets.Source(task_merge.call.assets.target.counts,
                                          core.File(sampleinfo_fh.name)),
                            Assets.Target.createundefined())
            # The per-tool parameters must be handed to the task: they
            # were previously left out of this call and never applied.
            task_de = project.add_task(diffexp, assets, parameters=params)
            torun.append(task_de)

    # run the tasks
    # (this is an integration test rather than a unit test - the
    # 3rd-party tools are often brittle and we want to keep the noise level down)
    env_log_level = environment.logger.level
    environment.logger.level = logging.ERROR
    try:
        for task in torun:
            if task.info[1] != hortator._TASK_DONE:
                try:
                    task.execute()
                    status = easy.hortator._TASK_DONE
                except Exception:
                    # Best effort: a failing tool marks the task as failed.
                    # KeyboardInterrupt/SystemExit are deliberately not
                    # caught, so the run can still be interrupted.
                    status = easy.hortator._TASK_FAILED
                # Record the outcome for the task that was just attempted.
                project.persistent_graph.step_concrete_state(
                    hortator.DbID(task.task_id, False),
                    easy.hortator._TASK_STATUS_LIST[status])
    finally:
        # always restore the logging level changed above
        environment.logger.level = env_log_level
def test_RecipeSimple(self):
    # Straight-line recipe: index, then align and quantify every sample,
    # merge the counts, add an edgeR task, run the queued tasks and check
    # what the project reports for the DIFFEXP activity.

    # Fixture attributes (some are unused below; the lookups are kept).
    project = self.project
    env = self.env
    nsamples = self.nsamples
    samplereads = self.samplereads
    sampleinfo_fh = self.sampleinfo_fh
    reference_fn = self.reference_fn
    referenceannotation = self.referenceannotation
    PHAGEFASTA = self._PHAGEFASTA
    PHAGEGFF = self._PHAGEGFF

    # -- recipesimple-test-begin

    # steps used
    bowtie2index = env.activities.INDEX.bowtie2build
    bowtie2align = env.activities.ALIGN.bowtie2
    htseqcount = env.activities.QUANTIFY.htseqcount
    merge = env.activities.UTILITY.columnmerger
    edger = env.activities.DIFFEXP.edger

    from railroadtracks import easy

    # sequence of tasks to run
    torun = []

    # index for alignment
    Assets = bowtie2index.Assets
    reference = rnaseq.FASTAFile(reference_fn)
    assets = Assets(Assets.Source(reference),
                    Assets.Target.createundefined())
    task_index = project.add_task(bowtie2index, assets)
    torun.append(task_index)

    # process all samples
    sample_counts = []
    for read1_fh, read2_fh in samplereads:
        # align
        Assets = bowtie2align.Assets
        align_source = Assets.Source(
            task_index.call.assets.target.indexfilepattern,
            rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
            rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name))
        assets = Assets(align_source, Assets.Target.createundefined())
        task_align = project.add_task(bowtie2align, assets)
        torun.append(task_align)

        # quantify
        # (non-default parameters to fit our demo GFF)
        params = rnaseq.HTSeqCount._noexons_parameters
        Assets = htseqcount.Assets
        quantify_source = Assets.Source(
            task_align.call.assets.target.alignment,
            rnaseq.GFFFile(referenceannotation))
        assets = Assets(quantify_source, Assets.Target.createundefined())
        task_quantify = project.add_task(htseqcount, assets,
                                         parameters=params)
        torun.append(task_quantify)

        # keep a pointer to the counts,
        # as we will use them in the merge step
        sample_counts.append(task_quantify.call.assets)

    # merge the sample data into a table
    # (so differential expression can be computed)
    Assets = merge.Assets
    counts = tuple(x.target.counts for x in sample_counts)
    merge_source = Assets.Source(rnaseq.CSVFileSequence(counts))
    assets = Assets(merge_source, merge.Assets.Target.createundefined())
    task_merge = project.add_task(merge, assets, parameters=("0", "1"))
    torun.append(task_merge)

    # differential expression with edgeR
    # NOTE(review): task_de is added to the project but not appended to
    # torun, so the loop below does not execute it - confirm this is
    # intended.
    Assets = edger.Assets
    de_source = Assets.Source(task_merge.call.assets.target.counts,
                              rnaseq.CSVFile(sampleinfo_fh.name))
    assets = Assets(de_source, Assets.Target.createundefined())
    task_de = project.add_task(edger, assets)

    # run the tasks, skipping the ones already done
    for task in torun:
        if task.info[1] == hortator._TASK_DONE:
            continue
        task.execute()

    # get results
    final_storedentities = project.get_targetsofactivity(rnaseq.ACTIVITY.DIFFEXP)

    # get the step that created the results files
    final_steps = [
        project.persistent_graph.get_parenttask_of_storedentity(stored_entity)
        for stored_entity in final_storedentities
    ]
    # -- recipesimple-test-end

    # exactly one DIFFEXP target, a File, whose parent step is EdgeR
    self.assertEqual(1, len(final_storedentities))
    self.assertEqual(core.File.__name__, final_storedentities[0].clsname)
    self.assertEqual('railroadtracks.model.diffexp.EdgeR',
                     final_steps[0].clsname)
def _recipesimpleincremental(self, runtasks):
    """Rebuild the simple recipe five times, one stage further each time.

    Every pass re-adds the tasks from the start; pass 0 stops after the
    index, pass 1 after the alignments, pass 2 after the quantifications,
    pass 3 after the merge and pass 4 after the differential expression.
    At each stop ``runtasks`` is called with the tasks queued so far and
    the number of concrete steps known to the project is checked.

    :param runtasks: callable taking the list of tasks queued so far.
    """
    # Fixture attributes (some are unused below; the lookups are kept).
    project = self.project
    env = self.env
    nsamples = self.nsamples
    samplereads = self.samplereads
    sampleinfo_fh = self.sampleinfo_fh
    reference_fn = self.reference_fn
    referenceannotation = self.referenceannotation
    PHAGEFASTA = self._PHAGEFASTA
    PHAGEGFF = self._PHAGEGFF

    # steps used
    activities = env.activities
    bowtie2index = activities.INDEX.bowtie2build
    bowtie2align = activities.ALIGN.bowtie2
    htseqcount = activities.QUANTIFY.htseqcount
    merge = activities.UTILITY.columnmerger
    edger = activities.DIFFEXP.edger

    for iteration in range(5):
        # Set inside the per-sample loop when this pass must stop before
        # the merge step (a `continue` there only skips to the next sample).
        stop_after_samples = False
        # sequence of tasks to run
        torun = []

        # index for alignment
        Assets = bowtie2index.Assets
        reference = rnaseq.FASTAFile(reference_fn)
        assets = Assets(Assets.Source(reference),
                        Assets.Target.createundefined())
        task_index = project.add_task(bowtie2index, assets)
        torun.append(task_index)
        if iteration < 1:
            runtasks(torun)
            self.assertEqual(1, project.persistent_graph.nconcrete_steps)
            continue

        # process all samples
        sample_counts = []
        for sample_i, (read1_fh, read2_fh) in enumerate(samplereads):
            # align
            Assets = bowtie2align.Assets
            align_source = Assets.Source(
                task_index.call.assets.target.indexfilepattern,
                rnaseq.FASTQPossiblyGzipCompressed(read1_fh.name),
                rnaseq.FASTQPossiblyGzipCompressed(read2_fh.name))
            assets = Assets(align_source, Assets.Target.createundefined())
            task_align = project.add_task(bowtie2align, assets)
            torun.append(task_align)
            if iteration < 2:
                stop_after_samples = True
                runtasks(torun)
                # 1 index + one alignment per sample seen so far
                self.assertEqual(1 + (sample_i + 1),
                                 project.persistent_graph.nconcrete_steps)
                continue

            # quantify
            # (non-default parameters to fit our demo GFF)
            params = rnaseq.HTSeqCount._noexons_parameters
            Assets = htseqcount.Assets
            quantify_source = Assets.Source(
                task_align.call.assets.target.alignment,
                rnaseq.GFFFile(referenceannotation))
            assets = Assets(quantify_source,
                            Assets.Target.createundefined())
            task_quantify = project.add_task(htseqcount, assets,
                                             parameters=params)
            torun.append(task_quantify)
            if iteration < 3:
                stop_after_samples = True
                runtasks(torun)
                # 1 index + all alignments + one quantification per
                # sample seen so far
                self.assertEqual(1 + len(samplereads) + (sample_i + 1),
                                 project.persistent_graph.nconcrete_steps)
                continue

            # keep a pointer to the counts, as we will use it in the merge step
            sample_counts.append(task_quantify.call.assets)

        if stop_after_samples:
            continue

        # merge the sample data into a table (so differential expression can be computed)
        Assets = merge.Assets
        counts = tuple(x.target.counts for x in sample_counts)
        merge_source = Assets.Source(rnaseq.CSVFileSequence(counts))
        assets = Assets(merge_source,
                        merge.Assets.Target.createundefined())
        task_merge = project.add_task(merge, assets, parameters=("0", "1"))
        torun.append(task_merge)
        if iteration < 4:
            runtasks(torun)
            # 1 index + 2 steps per sample + 1 merge
            self.assertEqual(1 + 2 * len(samplereads) + 1,
                             project.persistent_graph.nconcrete_steps)
            continue

        # differential expression with edgeR
        # (the task is added to the project but not queued in torun)
        Assets = edger.Assets
        de_source = Assets.Source(task_merge.call.assets.target.counts,
                                  rnaseq.CSVFile(sampleinfo_fh.name))
        assets = Assets(de_source, Assets.Target.createundefined())
        task_de = project.add_task(edger, assets)

        # Last stage: only reached on the final pass (iteration 4).
        runtasks(torun)
        # 1 index + 2 steps per sample + 1 merge + 1 differential expression
        self.assertEqual(1 + 2 * len(samplereads) + 2,
                         project.persistent_graph.nconcrete_steps)