def get_shell_command(self, task):
    '''Get the command line from the module file and format it with
       proper snakemake wildcard notation.'''
    try:
        cmd = utils.get_command_from_module(self.W.modules[task])
    except ConfigError as e:
        raise ConfigError('While parsing the task file "%s", we bumped '
                          'into the following problem: "%s".' % (task, e))

    param_dict_for_cmdline = self.get_param_dict_for_cmdline(task)
    try:
        cmd = cmd.format(**param_dict_for_cmdline)
    except KeyError as e:
        raise ConfigError('Something went wrong while parsing the task file "%s". '
                          'The following keyword is causing trouble: %s' % (task, e))

    if '>' not in cmd:
        # this is kind of hacky, but if the command does not involve piping
        # output then we want to pipe the output to the log file
        cmd = cmd + ' >> {log} 2>&1'
    else:
        # if the command includes piping output
        # then we just redirect stderr to the log
        cmd = cmd + ' 2>{log}'

    return cmd
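# A hypothetical illustration of what get_shell_command produces (the module
# command below is made up): if the module file yields
#
#     Rscript <libdir>/count.R --input {input_bam} --out {out_table}
#
# then, since it contains no '>', the rule's shell line becomes
#
#     Rscript <libdir>/count.R --input ... --out ... >> {log} 2>&1
#
# with {log} left intact for snakemake to fill in. A command that already
# pipes its output (e.g. ending in '> {out_table}') would instead only get
# ' 2>{log}' appended.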
def init(self, workflow_name):
    '''If a regular instance of a workflow object is being generated, we
       expect it to have a parameter `args`. If no `args` are given, we
       assume the class is being inherited as a base class from within
       another. For a regular instance of a workflow this function will
       set the args and init the WorkflowSuperClass.'''
    if not self.config:
        raise ConfigError('You need to provide a config file to run this workflow.')
    self.config_sanity_checks()

    self.name = workflow_name
    # if the user did not specify a directory then use the current
    # directory and put everything under a "Flow" directory
    self.ROOT_DIR = self.config.get('ROOT_DIR', os.path.join(os.getcwd(), "Flow"))

    self.load_pairs_table()
    self.tasks = self.get_tasks_dict()
    self.add_task_definitions(workflow_name)
    self.populate_dirs_dict()

    os.makedirs(self.dirs_dict['LOGS_DIR'], exist_ok=True)
    os.makedirs(self.ROOT_DIR, exist_ok=True)
def get_command_from_module(deploy_path):
    if not filesnpaths.is_file_exists(deploy_path, dont_raise=True):
        raise ConfigError('The following module file/folder is missing: "%s".' % deploy_path)

    tmpfile = save_command_from_module_to_TXT_file(deploy_path)
    with open(tmpfile) as f:
        cmd = f.read().strip()
    return cmd
def load_pairs_table(self):
    pairs_rds = self.config.get('pairs_rds')
    if not pairs_rds:
        raise ConfigError('You must specify a path to a pairs rds file in your config file.')
    if not filesnpaths.is_file_exists(pairs_rds, dont_raise=True):
        raise ConfigError('The pairs rds file path that was provided does not exist: %s'
                          % pairs_rds)

    self.pairs = pd.read_csv(utils.save_pairs_table_as_TAB_delimited(pairs_rds),
                             sep='\t', index_col=0)
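# A hedged sketch of what self.pairs might look like after loading (the column
# names are assumptions for illustration): the .rds file is converted to a
# TAB-delimited table, and the first column becomes the index that supplies
# the {pair} wildcard values, e.g.:
#
#   pair    bam_path          group
#   p01     maps/p01.bam      A
#   p02     maps/p02.bam      B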
def check_input_params(self):
    '''Check whether two tasks have the same input parameter pointing to different columns.'''
    for iparam in self.input_param_dict:
        if len(self.input_param_dict[iparam]) > 1:
            task_iter = iter(self.input_param_dict[iparam])
            task1 = next(task_iter)
            column_name1 = self.param_dataframes[task1].loc[iparam, 'param_name_in_pairs_table']
            mismatch = [(t, self.param_dataframes[t].loc[iparam, 'param_name_in_pairs_table'])
                        for t in task_iter
                        if self.param_dataframes[t].loc[iparam, 'param_name_in_pairs_table'] != column_name1]
            if mismatch:
                raise ConfigError('Task files with the same parameters must also point to '
                                  'the same column in the pairs table, yet two or more of '
                                  'your tasks have identical parameters that point to '
                                  'different columns in the pairs table. For example: the '
                                  'input parameter %s is found in tasks %s and %s, but '
                                  'points to columns %s and %s, respectively.'
                                  % (iparam, task1, mismatch[0][0], column_name1, mismatch[0][1]))
def load_param_table_from_task_file(task_file):
    '''Load the parameters from the task file as a data frame.'''
    with open(os.path.abspath(task_file)) as fp:
        f = fp.read().splitlines()

    # removing trailing spaces, skipping commented lines, and getting rid of empty lines
    f = [s.strip() for s in f if not s.startswith('#') and len(s.strip()) > 0]
    # skipping the line in which the module is mentioned
    f = f[1:]
    # converting sequences of two or more spaces and/or tabs to a single tab
    task_lines = [re.sub(r'\s{2,}', '\t', s) for s in f]

    col_names = get_task_column_names()
    d = pd.DataFrame(index=range(len(task_lines)), columns=col_names)
    for i in d.index:
        cols = col_names.copy()
        cols.reverse()
        task_file_columns = task_lines[i].split('\t')
        if len(task_file_columns) > len(cols):
            raise ConfigError('The task file should only have up to %s columns, '
                              'but one of the task files you provided ("%s") has '
                              '%s columns.' % (len(cols), task_file, len(task_file_columns)))
        for s in task_file_columns:
            c = cols.pop()
            d.loc[i, c] = s

    # set the param name as the index
    d.set_index('param', inplace=True)
    return d
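# A minimal sketch of the task file layout this parser expects, based on the
# logic above (the real column names and their order come from
# get_task_column_names(), so the header below is an assumption for
# illustration). The first non-comment line names the module and is skipped
# here; columns are separated by runs of two or more spaces or tabs:
#
#   module  /path/to/module
#   # param        io_type   param_type   param_name_in_pairs_table
#   input_bam      input     path         bam_path
#   min_quality    input     value        "30"
#   out_table      output    path         "counts.txt"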
def __init__(self, args):
    self.args = args
    A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None

    self.config = A('config')
    self.config_file = A('config_file')
    self.threads = {}
    self.target_files = []
    self.input_param_dict = {}   # dictionary to connect input parameters to tasks
    self.output_param_dict = {}  # dictionary to connect output parameters to tasks
    self.io_dict = {}            # dictionary with output parameters and a list of tuples
                                 # of matching inputs with the format (task, input)
    self.param_dataframes = {}
    self.pairs = None
    self.tasks = {}
    self.modules = {}
    self.dirs_dict = {"LOGS_DIR": "00_LOGS"}
    self.params = {}
    self.subworkflows = []

    if not self.config and not self.config_file:
        raise ConfigError('You must provide a path to a config file.')

    if not self.config:
        filesnpaths.is_file_json_formatted(self.config_file)
        self.config = json.load(open(self.config_file))
def get_tasks_dict(self):
    '''Returns the task dictionary by reading the config file.

       The input in the config file could be either:
           1. A list of paths to task files.
           2. A single task file.
           3. A path to a directory.
       An empty dictionary is returned by default.
    '''
    task_list = self.config.get('tasks', [])
    if type(task_list) is not list:
        if type(task_list) is not str:
            raise ConfigError('"tasks" must be provided as a list or as a '
                              'single string in the config file, but you '
                              'provided a "%s".' % type(task_list))
        if os.path.isdir(task_list):
            # a directory was provided, so get all the *.task files from it
            import glob
            task_list = glob.glob(os.path.join(task_list, '*.task'))
        else:
            # a single path was provided, so let's convert it to a list
            task_list = [task_list]

    task_file_with_bad_suffix = [t for t in task_list if not t.endswith('.task')]
    if task_file_with_bad_suffix:
        raise ConfigError('Task files must have the suffix ".task", but some of '
                          'the task files you provided don\'t. For example: '
                          '%s.' % task_file_with_bad_suffix[0])

    for t in task_list:
        # make sure the task file exists
        filesnpaths.is_file_exists(t)

    tasks = dict([(utils.fix_name(os.path.basename(t)[:-5]), t) for t in task_list])
    return tasks
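# The 'tasks' entry in the config file may take any of the three forms handled
# above; a hypothetical config.json fragment for each (only one form would be
# used at a time, and the paths are made up):
#
#   {"tasks": ["tasks/align.task", "tasks/count.task"]}   # 1. a list of task files
#   {"tasks": "tasks/align.task"}                         # 2. a single task file
#   {"tasks": "tasks"}                                    # 3. a directory scanned for *.task files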
def get_rule_param(self, task, param, wildcards):
    param_value = ''

    if param not in self.param_dataframes[task].index:
        task_file = self.tasks[task]
        raise ConfigError('Someone is requesting a parameter that is not defined '
                          'in the task file. Here are the details: the parameter '
                          '%s was requested for %s, but it is not listed in the '
                          'task file: %s' % (param, task, task_file))

    param_column_name = self.get_param_name_from_task_file(task, param)

    if utils.is_param_a_literal(param_column_name):
        # if it is a literal then we simply return the literal value
        param_value = utils.fix_param(param_column_name)
    elif param_column_name in self.pairs.columns:
        # if there is such a column in the pairs table then we read the value from there
        param_value = self.pairs.loc[wildcards.pair, param_column_name]
        if pd.isna(param_value):
            param_value = ''
    elif param_column_name == self.pairs.index.name:
        # the parameter is the key parameter (usually "pair")
        param_value = wildcards.pair

    if not param_value:
        # get the default value from the task file
        param_value = self.get_default_value_from_task_file(task, param)

    if not param_value:
        if param not in self.output_param_dict:
            raise ConfigError('The following parameter is missing: "%s" from the '
                              'pairs table for the pair id "%s". You must either '
                              'populate the pairs table or provide a default value '
                              'in the %s task file.' % (param, wildcards.pair, task))

    if param_value and (self.get_param_type_from_task_file(task, param) == 'path'):
        param_value = utils.fix_path(param_value)

    return param_value
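# To summarize the resolution order implemented above: a quoted literal in the
# task file wins; otherwise the value is read from the matching column of the
# pairs table for the current {pair}; otherwise, if the parameter names the
# pairs-table index itself, the pair id is used; failing all of those, the
# task file's default value applies, and a parameter that is still empty (and
# is not an output) raises a ConfigError.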
def run_command(cmdline, log_file_path, first_line_of_log_is_cmdline=True,
                remove_log_file_if_exists=True, silent=False):
    """Uses subprocess.call to run your `cmdline`."""
    cmdline = format_cmdline(cmdline)
    filesnpaths.is_output_file_writable(log_file_path)

    if remove_log_file_if_exists and os.path.exists(log_file_path):
        os.remove(log_file_path)

    try:
        if first_line_of_log_is_cmdline:
            with open(log_file_path, "a") as log_file:
                log_file.write('# DATE: %s\n# CMD LINE: %s\n' % (get_date(), ' '.join(cmdline)))

        if not silent:
            print('Running the command: "%s". Log file: %s' % (' '.join(cmdline), log_file_path))

        with open(log_file_path, 'a') as log_file:
            ret_val = subprocess.call(cmdline, shell=False, stdout=log_file,
                                      stderr=subprocess.STDOUT)

        if ret_val < 0:
            raise ConfigError('The command was terminated. There could be a hint here: %s.'
                              % log_file_path)
        else:
            return ret_val
    except OSError as e:
        raise ConfigError("The command failed for the following reason: '%s' ('%s')"
                          % (e, cmdline))
def add_task_definitions(self, workflow_name):
    '''Iterate through tasks to populate input and output definitions and module paths.'''
    if not self.tasks:
        raise ConfigError('You must include at least one task in your config file.')

    for task in self.tasks:
        self.read_task_file(task)

    #self.check_input_params()
    self.populate_io_dict()
    self.update_defaults_using_output_parameters()
    self.update_targets()
def format_cmdline(cmdline):
    """Takes a cmdline for `run_command` or `run_command_STDIN`, and makes it beautiful."""
    if not cmdline or (not isinstance(cmdline, str) and not isinstance(cmdline, list)):
        raise ConfigError("You made utils::format_cmdline upset. The parameter you sent "
                          "to run kinda sucks. It should be of string or list type. Note "
                          "that the parameter `shell` for subprocess.call in the "
                          "`run_command` function is always False, therefore if you send "
                          "a string, it will be split into a list prior to being sent to "
                          "subprocess.")

    if isinstance(cmdline, str):
        cmdline = [str(x) for x in cmdline.split(' ')]
    else:
        cmdline = [str(x) for x in cmdline]

    return cmdline
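# A quick sketch of how format_cmdline normalizes its input; both forms end up
# as a flat list of strings, since `run_command` always calls subprocess.call
# with shell=False:
#
#   format_cmdline('Rscript -e 1')         # -> ['Rscript', '-e', '1']
#   format_cmdline(['Rscript', '-e', 1])   # -> ['Rscript', '-e', '1']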
def is_param_a_literal(param):
    '''Parameters that are surrounded by quotes are considered literals by Flow,
       meaning they are used as-is instead of naming a column in the pairs table.'''
    if param is None:
        return False

    if type(param) is not str:
        raise ConfigError('Parameters must be of type str, but someone provided a '
                          'parameter of type %s. This is the kind of error you should '
                          'never encounter, so you might have to contact one of the '
                          'developers.' % type(param))

    for q in ['"', "'"]:
        if param.startswith(q) and param.endswith(q) and len(param) > 1:
            return True

    return False
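# For example (the values here are made up):
#
#   is_param_a_literal('"counts.txt"')   # -> True: used as-is
#   is_param_a_literal("'30'")           # -> True: single quotes work too
#   is_param_a_literal('bam_path')       # -> False: treated as a pairs-table column
#   is_param_a_literal(None)             # -> False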
def get_snakefile_output_param(task, param, filename, wildcards='pair'):
    s = " {param} = os.path.realpath(os.path.join({task}_workflow_object.ROOT_DIR, dirs_dict['{task}'], '{wildcards}', '{filename}'))"

    if type(wildcards) == list:
        # this is a placeholder in case we would want to use multiple wildcards in the
        # future. notice that if we go down this road then we would need to also treat
        # the log definition to contain all wildcards, as well as change get_rule_param
        # to be compatible with such a change
        wildcards_str = ', '.join(['{%s}' % wildcard for wildcard in wildcards])
    elif type(wildcards) == str:
        wildcards_str = '{%s}' % wildcards
    else:
        raise ConfigError('Wildcards must be either a single string or a list of '
                          'strings, but an object of type %s was provided.' % type(wildcards))

    s = s.format(param=param, task=task, filename=filename, wildcards=wildcards_str)
    return s
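# A hypothetical rendering for task 'count', param 'out_table', and filename
# 'counts.txt' with the default 'pair' wildcard (all names are made up):
#
#   out_table = os.path.realpath(os.path.join(count_workflow_object.ROOT_DIR, dirs_dict['count'], '{pair}', 'counts.txt'))
#
# Note that '{pair}' is left unformatted on purpose, so that snakemake can
# expand it per pair.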
def check_for_R_packages(required_packages):
    # before we do anything let's make sure the user has R installed
    is_program_exists('Rscript')

    # let's make sure all the required packages are installed
    missing_packages = []
    log_file = filesnpaths.get_temp_file_path()
    for lib in required_packages:
        ret_val = run_command(["Rscript", "-e", "library('%s')" % lib], log_file, silent=True)
        if ret_val != 0:
            missing_packages.append(lib)

    if missing_packages:
        raise ConfigError('The following R packages are required in order to run '
                          'this program, but are missing: %s.' % ', '.join(missing_packages))
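# A hedged usage sketch (the package names below are placeholders, not actual
# requirements of this code base):
#
#   check_for_R_packages(['ggplot2', 'dplyr'])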
def is_program_exists(program, dont_raise=False):
    is_exe = lambda p: os.path.isfile(p) and os.access(p, os.X_OK)

    fpath, fname = os.path.split(program)
    if fpath:
        if is_exe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = os.path.expanduser(path).strip('"')
            exe_file = os.path.join(path, program)
            if is_exe(exe_file):
                return exe_file

    if dont_raise:
        return False

    raise ConfigError("The following software: '%s' needs to be installed on your system, "
                      "but it doesn't seem to appear in your path :/ If you are certain "
                      "you have it on your system (for instance you can run it by typing "
                      "'%s' in your terminal window), you may want to send a detailed "
                      "bug report. Sorry!" % (program, program))
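# For example:
#
#   is_program_exists('Rscript')                   # -> full path to the executable, or raises ConfigError
#   is_program_exists('Rscript', dont_raise=True)  # -> full path, or False if not found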
def read_task_file(self, task):
    '''Populate the param_dataframes, input_param_dict, output_param_dict,
       and module path by reading the task file.'''
    task_file = self.tasks.get(task)
    if not task_file:
        raise ConfigError('No task file was provided for task "%s" in your config file.' % task)

    try:
        filesnpaths.is_file_exists(task_file)
    except FilesNPathsError:
        raise ConfigError('The task file "%s" does not exist, and yet it was '
                          'provided for task "%s" in your config file.' % (task_file, task))

    # read the entire parameter table from the task file
    param_dataframe = utils.load_param_table_from_task_file(task_file)
    param_dataframe.index = [s.replace('.', '_') for s in param_dataframe.index]

    # make sure that all input params are of "param_type" path or value
    bad_params = [p for p in param_dataframe.loc[param_dataframe['io_type'] == 'input'].index
                  if param_dataframe.loc[p, 'param_type'] not in ['path', 'value']]
    if bad_params:
        raise ConfigError('Input parameters must be defined as either "path" or '
                          '"value", but your task file "%s" declares the param '
                          '"%s" as "%s".' % (task_file, bad_params[0],
                                             param_dataframe.loc[bad_params[0], 'param_type']))

    for iparam in param_dataframe.loc[(param_dataframe['io_type'] == 'input') &
                                      (param_dataframe['param_type'] == 'path')].index:
        # check for literals and store them as defaults
        if utils.is_param_a_literal(param_dataframe.loc[iparam, 'param_name_in_pairs_table']):
            # store the literal as the default value
            param_dataframe.loc[iparam, 'default_value'] = \
                param_dataframe.loc[iparam, 'param_name_in_pairs_table']
            # remove the value from the param_name_in_pairs_table column
            # (since we don't expect such a column to exist)
            param_dataframe.loc[iparam, 'param_name_in_pairs_table'] = None

        # populate the input param dict
        if self.input_param_dict.get(iparam):
            # such a parameter was already defined, so append this task name
            self.input_param_dict[iparam].append(task)
        else:
            self.input_param_dict[iparam] = [task]

    self.param_dataframes[task] = param_dataframe

    for oparam in param_dataframe.loc[param_dataframe['io_type'] == 'output'].index:
        # populate the output param dict
        if self.output_param_dict.get(oparam):
            # such a parameter was already defined. in the future we might
            # allow this, but for now we raise an error
            raise ConfigError('An output parameter can only be defined once for a '
                              'single task, yet two of your task files ("%s" and '
                              '"%s") define the same output: "%s".'
                              % (task, self.output_param_dict[oparam], oparam))
        else:
            self.output_param_dict[oparam] = task

    # get the module dir path
    self.modules[task] = utils.get_module_path_from_task_file(task_file)
def __init__(self, args):
    self.args = args
    A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None

    self.config = A('config')
    self.config_file = A('config_file')
    self.name = A('name')
    self.output_dir = A('output_dir')

    self.W = WorkflowSuperClass(argparse.Namespace(config_file=self.config_file))
    self.W.init(self.name)

    # in the future we might turn this into a list of wildcards
    wildcard = '{pair}'

    if os.path.isdir(self.output_dir):
        raise ConfigError('There is already an output directory %s. We don\'t '
                          'like overwriting stuff.' % self.output_dir)

    # get the template
    with open(get_path_to_snakefile_template()) as f:
        template = f.read()

    allparams = {}
    for task in self.W.param_dataframes:
        d = self.W.param_dataframes[task]

        params = []
        for param in d.loc[(d['io_type'] == 'input') & (d['param_type'] == 'value')].index:
            # iterate through non-file inputs (AKA params)
            params.append(get_snakefile_param_definition(task, param))
        param_str = ',\n'.join(params)
        if param_str:
            # we need to add an extra comma at the end of the params we are adding,
            # because the "output_dir" and "module_path" params are included in the template
            param_str = param_str + ','

        inputs = []
        for param in d.loc[(d['io_type'] == 'input') & (d['param_type'] == 'path')].index:
            # iterate through "path" inputs (AKA input files)
            inputs.append(get_snakefile_param_definition(task, param))
        input_str = ',\n'.join(inputs)
        if input_str:
            input_str = ' input:\n' + input_str

        outputs = []
        for param, row in d.loc[d['io_type'] == 'output'].iterrows():
            # iterate through outputs (AKA output files)
            filename = utils.fix_param(row['param_name_in_pairs_table'])
            outputs.append(get_snakefile_output_param(task, param, filename))
        output_str = ',\n'.join(outputs)

        run_cmd = self.get_shell_command(task)

        format_dict = {'task': task,
                       'inputs': input_str,
                       'outputs': output_str,
                       'task_params': param_str,
                       'run_cmd': run_cmd,
                       'wildcard': wildcard}
        snakefile = template.format(**format_dict)
        snakefile = snakefile.replace('<libdir>', '{params.module_path}/')

        snakefile_dir = os.path.join(self.output_dir, task)
        os.makedirs(snakefile_dir, exist_ok=True)
        snakefile_path = os.path.join(snakefile_dir, 'Snakefile')
        with open(snakefile_path, 'w') as f:
            f.write(snakefile)

    # create the main snakefile
    with open(get_path_to_main_snakefile_template()) as f:
        main_template = f.read()

    include_cmd = ''
    for task in self.W.tasks:
        include_cmd += 'include: "%s"\n' % utils.fix_path(
            os.path.join(self.output_dir, task, 'Snakefile'))

    main_snakefile = main_template.format(name=self.name, include_cmd=include_cmd)
    snakefile_path = os.path.join(self.output_dir, 'Snakefile')
    print('Writing the main Snakefile for workflow "%s" to: %s' % (self.name, snakefile_path))
    with open(snakefile_path, 'w') as f:
        f.write(main_snakefile)