The pipeline is configured by an options file in YAML format, including the actual commands which are run at each stage. ''' from ruffus import * import os.path import shutil from utils import (runStage, splitPath, getOptions, initLog, getCommand) from chrom_info import (chromInfo) import sys import glob # Read the configuation options from file, determine the reference file # and list of sequence files. options = getOptions() reference = options['reference'] #sequences = options['sequences'] sequencePatterns = options['sequences'] sequences = [] if type(sequencePatterns) == list: for pattern in sequencePatterns: sequences.append(glob.glob(pattern)) else: sequences = glob.glob(sequencePatterns) # Start the logging process. logger = initLog(options) # Get information about chromosomes in the reference file chromosomes = chromInfo(reference) # Index the reference file.
# NOTE(review): this is the tail of a function (presumably
# convertNDDSToCDS, invoked from the __main__ guard below); its 'def'
# line and the dataFileName/queryFileName/vpFileName/cdsDataFileName/
# cdsQueryFileName variables are defined above this chunk.
datas = utils.getDataInFile(dataFileName)
querys = utils.readDataFromFile(queryFileName)
vps = utils.readDataFromFile(vpFileName)
print len(datas), len(querys), len(vps)

# For each data vector, build a row of Hamming distances to every
# vantage point — presumably mapping NDDS records into a CDS
# (distance-vector) representation; confirm against utils' readers.
cdsDatas = []
for i in xrange(len(datas)):
    t = []
    for j in xrange(len(vps)):
        t.append(utils.hammingDistance(datas[i], vps[j]))
    cdsDatas.append(t)
utils.writeDataToFile(cdsDataFileName, cdsDatas)

# Apply the same vantage-point distance transformation to the queries.
cdsQuerys = []
for i in xrange(len(querys)):
    t = []
    for j in xrange(len(vps)):
        t.append(utils.hammingDistance(querys[i], vps[j]))
    cdsQuerys.append(t)
utils.writeDataToFile(cdsQueryFileName, cdsQuerys)
print cdsDataFileName, cdsQueryFileName

if __name__ == '__main__':
    # Ensure the output directories exist before running the conversion.
    utils.createDirectory('cds_data')
    utils.createDirectory('cds_query')
    options = utils.getOptions()
    convertNDDSToCDS(options)
# NOTE(review): tail of a vantage-point generation function; its 'def'
# line and the dim/alphabet/cardinality/numberOfVP/cornerPoints/base
# variables are defined above this chunk.
getCornerPoints(0, [], dim, alphabet, cardinality)
vps = []
# Seed the vantage-point set with the all-'A' corner.
vps.append([ 'A' for i in xrange(dim) ])
#vps.append([ 'B' for i in xrange(dim) ])
#vps.append([ 'C' for i in xrange(dim) ])
#vps.append([ 'D' for i in xrangege(dim) ]) if False else None  # (unreachable)
# Greedily pick the remaining numberOfVP-1 points: at each step take the
# corner point that minimises the total cost function w.r.t. the points
# chosen so far.
for i in xrange(numberOfVP - 1):
    print i
    #mx, mx_idx = (0,0)
    # 987654321.0 acts as a sentinel "infinity" for the running minimum.
    mn, mn_idx = (987654321.0, 0)
    for j in xrange(len(cornerPoints)):
        variation = getTotalCostFunction(vps, cornerPoints[j], base)
        if mn > variation:
            mn, mn_idx = (variation, j)
        #if mx < dist:
        #    mx, mx_idx = (dist,j)
    print mn, mn_idx, cornerPoints[mn_idx]
    vps.append(cornerPoints[mn_idx])
# Persist the chosen vantage points, keyed by dimension/count/cardinality.
utils.writeDataToFile('vp/vp_%d_%d_%d_greedy.txt' % (dim, numberOfVP, cardinality), vps)

if __name__ == '__main__':
    options = utils.getOptions()
    # Ensure the output directory exists before any generator writes to it.
    utils.createDirectory('vp')
    #generateVantagePoints(options)
    #generateGreedyVantagePoints(options)
    #generateAllRandomVantagePoints(options)
    generateHeuristicVantagePoints(options)
It supports parallel evaluation of independent pipeline stages, and can
run stages on a cluster environment. The pipeline is configured by an
options file in YAML format, including the actual commands which are run
at each stage. '''
from ruffus import *
import os.path
import shutil
from utils import (runStage, splitPath, getOptions, initLog, getCommand)

# Read the configuration options from file, determine the reference file
# and list of sequence files.
options = getOptions()
reference = options['reference']
sequences = options['sequences']
# Whether the reads are paired-end; read from the pipeline section of
# the options file.
isPairedEnd = options['pipeline']['paired']

# Start the logging process.
logger = initLog(options)

# Index the reference file.
@files(reference, reference + '.bwt', logger)
def mkRefDataBase(reference, output, logger):
    # Delegates to the 'mkRefDataBase' command configured in the options
    # file; produces the .bwt index next to the reference.
    runStage('mkRefDataBase', logger, options, reference, output)

# Index the reference file.
# XXX not sure why we need to do both mkRefDataBase and indexReference.
Description: Simple pipeline to demonstrate how to use the base tools.
Counts the number of lines in a set of files and then sums them up. '''
import sys
from ruffus import *
from utils import (runStageCheck, getOptions, initLog)
from cmdline_args import get_cmdline_args

# Parse the command line, load the YAML options, and start logging.
args = get_cmdline_args()
options = getOptions(args)
logDir = options.pipeline['logDir']
logger = initLog(options)

# the input files
data_files = ['test_data/data1.txt', 'test_data/data2.txt']

# count the number of lines in a file
@transform(data_files, suffix('.txt'), ['.count', '.count.Success'])
def countLines(file, outputs):
    # Each input produces a .count file plus a .Success flag file that
    # marks stage completion for runStageCheck.
    output,flagFile = outputs
    runStageCheck('countLines', flagFile, logger, options, file, output)

# sum the counts from the previous stage
# NOTE(review): the body of total() continues beyond this chunk.
@merge(countLines, ['test_data/total.txt', 'test_data/total.Success'])
def total(files, outputs):
def main():
    '''Entry point: load options, import the user's pipeline module so
    its stages are registered, then run/draw/print it per the requested
    style.'''
    args = get_cmdline_args()

    # Search for modules in the directory local to the pipeline, just as
    # if the pipeline script had been invoked directly. This covers the
    # script itself and the config files imported by getOptions.
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # Options must be set before the pipeline module is imported.
    options = getOptions(args)
    setOptions(options)

    # Importing the pipeline (named on the command line) defines its stages.
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()

    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']

    # Only 'fromend' turns off ruffus' maximal (from-start) rebuild mode;
    # 'fromstart' and any other value keep it on.
    rebuildMode = pipelineOptions['rebuild'] != 'fromend'

    if style in ['run', 'touchfiles']:
        # Perform the pipeline steps (run the pipeline). With the
        # 'touchfiles' style, files are merely brought up to date
        # without running anything.
        pipeline_run(
            # End points of the pipeline.
            endTasks,
            # How many ruffus tasks to run.
            multiprocess=pipelineOptions['procs'],
            logger=black_hole_logger,
            # Start from these stages regardless of up-to-date status.
            forcedtorun_tasks=forcedTasks,
            touch_files_only=(style == 'touchfiles'),
            # How ruffus decides how much work needs to be done.
            gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Draw the pipeline as a diagram.
        pipeline_printout_graph('flowchart.svg', 'svg', endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Describe textually what the pipeline would do, without
        # actually running it.
        pipeline_printout(sys.stdout, endTasks, verbose=5,
                          wrap_width=100000,
                          forcedtorun_tasks=forcedTasks,
                          gnu_make_maximal_rebuild_mode=rebuildMode)
Description: Simple pipeline to demonstrate how to use the base tools. Counts the number of lines in a set of files and then sums them up. ''' import sys from ruffus import * from utils import (runStageCheck, getOptions, initLog) from cmdline_args import get_cmdline_args args = get_cmdline_args() options = getOptions(args) logDir = options.pipeline['logDir'] logger = initLog(options) # the input files data_files = ['test_data/data1.txt', 'test_data/data2.txt'] # count the number of lines in a file @transform(data_files, suffix('.txt'), ['.count', '.count.Success']) def countLines(file, outputs): output, flagFile = outputs runStageCheck('countLines', flagFile, logger, options, file, output) # sum the counts from the previous stage
def main():
    '''Driver: resolve options, import the pipeline module named on the
    command line, and execute it in the chosen style (run, touchfiles,
    flowchart, or print).'''
    args = get_cmdline_args()

    # Make imports resolve relative to the pipeline's own directory, as
    # if the pipeline script had been run directly (this includes the
    # script and the config files getOptions pulls in).
    sys.path.insert(0, os.path.dirname(args.pipeline))

    # The options must be in place before the pipeline is imported.
    options = getOptions(args)
    setOptions(options)

    # Import the pipeline so its stages get defined.
    __import__(drop_py_suffix(args.pipeline))

    logDir = options.pipeline['logDir']
    startLogger()

    pipelineOptions = options.pipeline
    endTasks = pipelineOptions['end']
    forcedTasks = pipelineOptions['force']
    style = pipelineOptions['style']

    # Map the 'rebuild' option onto ruffus' rebuild mode: 'fromend'
    # disables maximal rebuild; 'fromstart' (and anything else) keeps it.
    rebuild = pipelineOptions['rebuild']
    if rebuild == 'fromend':
        rebuildMode = False
    else:
        rebuildMode = True

    if style in ['run', 'touchfiles']:
        # 'touchfiles' brings outputs up to date without running anything.
        touchfiles_flag = (style == 'touchfiles')
        # Run the pipeline to the requested end points.
        pipeline_run(endTasks,
                     # Number of ruffus tasks to run concurrently.
                     multiprocess=pipelineOptions['procs'],
                     logger=black_hole_logger,
                     # Stages to run regardless of their up-to-date state.
                     forcedtorun_tasks=forcedTasks,
                     touch_files_only=touchfiles_flag,
                     gnu_make_maximal_rebuild_mode=rebuildMode)
    elif style == 'flowchart':
        # Render the pipeline graph as an SVG diagram.
        pipeline_printout_graph('flowchart.svg',
                                'svg',
                                endTasks,
                                no_key_legend=False)
    elif style == 'print':
        # Print what the pipeline would do, without running it.
        pipeline_printout(sys.stdout,
                          endTasks,
                          verbose=5,
                          wrap_width=100000,
                          forcedtorun_tasks=forcedTasks,
                          gnu_make_maximal_rebuild_mode=rebuildMode)