Example #1
    def get_shell_command(self, task):
        ''' Get the command line from the module file and format it with proper snakemake wildcard notation.'''
        try:
            cmd = utils.get_command_from_module(self.W.modules[task])
        except ConfigError as e:
            raise ConfigError('While parsing the task file: "%s", we bumped \
                               into the following problem: "%s".' % (task, e))

        param_dict_for_cmdline = self.get_param_dict_for_cmdline(task)
        try:
            cmd = cmd.format(**param_dict_for_cmdline)
        except KeyError as e:
            raise ConfigError(
                'Something went wrong while parsing task file %s. \
                               The following keyword is causing trouble: %s' %
                (task, e))
        if '>' not in cmd:
            # this is kind of hacky
            # but if the command does not involve piping output
            # then we want to pipe output to the log file
            cmd = cmd + ' >> {log} 2>&1'
        else:
            # if the command includes piping output
            # then we just redirect stderr to the log
            cmd = cmd + ' 2>{log}'
        return (cmd)
Example #2
    def init(self, workflow_name):
        '''
            If a regular instance of a workflow object is being generated, we
            expect it to have a parameter `args`. If no `args` is given, we
            assume the class is being inherited as a base class from within
            another workflow.

            For a regular instance of a workflow this function sets the args
            and initializes the WorkflowSuperClass.
        '''
        self.name = workflow_name

        if not self.config:
            raise ConfigError(
                'You need to provide a config file to run this workflow.')

        # if the user did not specify a root directory, use the current
        # directory and put everything under a "Flow" directory
        self.ROOT_DIR = self.config.get('ROOT_DIR',
                                        os.path.join(os.getcwd(), "Flow"))
        self.load_pairs_table()

        self.tasks = self.get_tasks_dict()

        self.config_sanity_checks()
        self.add_task_definitions(workflow_name)
        self.populate_dirs_dict()
        os.makedirs(self.dirs_dict['LOGS_DIR'], exist_ok=True)
        os.makedirs(self.ROOT_DIR, exist_ok=True)
Example #3
def get_command_from_module(deploy_path):
    if not filesnpaths.is_file_exists(deploy_path, dont_raise=True):
        raise ConfigError(
            'The following module file/folder is missing: "%s".' % deploy_path)
    tmpfile = save_command_from_module_to_TXT_file(deploy_path)
    with open(tmpfile) as f:
        cmd = f.read().strip()
    return (cmd)
Example #4
    def load_pairs_table(self):
        pairs_rds = self.config.get('pairs_rds')
        if not pairs_rds:
            raise ConfigError(
                'You must specify a path to a pairs rds file in your config file.'
            )

        if not filesnpaths.is_file_exists(pairs_rds, dont_raise=True):
            raise ConfigError(
                'The pairs rds file path that was provided does not exist: %s'
                % pairs_rds)

        self.pairs = pd.read_csv(
            utils.save_pairs_table_as_TAB_delimited(pairs_rds),
            sep='\t',
            index_col=0)
Example #5
    def check_input_params(self):
        ''' Check whether two tasks that share an input parameter point it to different columns.'''
        for iparam in self.input_param_dict:
            if len(self.input_param_dict[iparam]) > 1:
                task_iter = iter(self.input_param_dict[iparam])
                task1 = next(task_iter)
                column_name1 = self.param_dataframes[task1].loc[
                    iparam, 'param_name_in_pairs_table']
                mismatch = [
                    (t,
                     self.param_dataframes[t].loc[iparam,
                                                  'param_name_in_pairs_table'])
                    for t in task_iter if self.param_dataframes[t].loc[
                        iparam, 'param_name_in_pairs_table'] != column_name1
                ]
                if mismatch:
                    raise ConfigError(
                        'Task files with the same parameters must \
                                       also point to the same column in the pairs \
                                       table, yet there are two or more tasks \
                                       with identical parameters that point to \
                                       different columns in the pairs table. For \
                                       example: the input parameter %s is found in \
                                       tasks %s, %s, but they point to columns %s, %s, \
                                       respectively.' %
                        (iparam, task1, mismatch[0][0], column_name1,
                         mismatch[0][1]))
Example #6
def load_param_table_from_task_file(task_file):
    ''' Load the parameters from the task file as a data frame'''
    with open(os.path.abspath(task_file)) as task_fh:
        f = task_fh.read().splitlines()
    # strip leading/trailing whitespace, skip commented lines, and drop empty lines
    f = [s.strip() for s in f if not s.startswith('#') and len(s.strip()) > 0]
    # skip the line in which the module is mentioned
    f = f[1:]

    # convert runs of two or more spaces/tabs to a single tab
    task_lines = [re.sub(r'\s{2,}', '\t', s) for s in f]

    col_names = get_task_column_names()
    d = pd.DataFrame(index=range(len(task_lines)), columns=col_names)
    for i in d.index:
        cols = col_names.copy()
        cols.reverse()
        task_file_columns = task_lines[i].split('\t')
        if len(task_file_columns) > len(cols):
            raise ConfigError(
                'The task file should only have up to %s columns, \
                               but one of the task files you provided ("%s") has \
                               %s columns.' %
                (len(cols), task_file, len(task_file_columns)))
        for s in task_file_columns:
            c = cols.pop()
            d.loc[i, c] = s
    # set the param name as the index:
    d.set_index('param', inplace=True)

    return d
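For reference, a task file that this parser would accept might look like the hypothetical sketch below. The exact column set comes from get_task_column_names(), which is not shown here; the column names and their order are assumptions based on how the other snippets use the resulting data frame, and fields are separated by two or more spaces or by tabs:

# hypothetical example of a task file's contents; the column order is an assumption
# param          io_type    param_type    param_name_in_pairs_table
modules/my_module
input_bam        input      path          bam_path
min_mapq         input      value         "30"
output_table     output     path          "counts.tsv"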
Example #7
    def __init__(self, args):
        A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None

        self.args = args
        self.config = A('config')
        self.config_file = A('config_file')
        self.threads = {}
        self.target_files = []
        self.input_param_dict = {}   # dictionary connecting input parameters to tasks
        self.output_param_dict = {}  # dictionary connecting output parameters to tasks
        self.io_dict = {}            # dictionary mapping each output parameter to a list of (task, input) tuples of matching inputs
        self.param_dataframes = {}
        self.pairs = None
        self.tasks = {}
        self.modules = {}
        self.dirs_dict = {"LOGS_DIR": "00_LOGS"}
        self.params = {}
        self.subworkflows = []

        if not self.config and not self.config_file:
            raise ConfigError('You must provide a path to a config file.')

        if not self.config:
            filesnpaths.is_file_json_formatted(self.config_file)
            with open(self.config_file) as config_fh:
                self.config = json.load(config_fh)
Example #8
    def get_tasks_dict(self):
        ''' Returns the task dictionary by reading the config file.

        The input in the config file could be either:
          1. A list of paths to task files.
          2. A single task file.
          3. A path to a directory.
        An empty dictionary is returned by default.
        '''

        tasks = {}
        task_list = self.config.get('tasks', [])

        if type(task_list) is not list:
            if type(task_list) is not str:
                raise ConfigError('"tasks" must be provided as a list or as a \
                                   single string in the config file, but you \
                                   provided a "%s"' % type(task_list))
            # check if directory
            if os.path.isdir(task_list):
                # get all the *.task files from the directory
                import glob
                task_list = glob.glob(os.path.join(task_list, '*.task'))
            else:
                # a single path was provided so let's convert to list
                task_list = [task_list]

        task_file_with_bad_suffix = [
            t for t in task_list if not t.endswith('.task')
        ]
        if task_file_with_bad_suffix:
            raise ConfigError(
                'Task files must have suffix ".task", but some of \
                               the task files you provided don\'t. For example: \
                               %s' % task_file_with_bad_suffix[0])

        for t in task_list:
            # make sure the task file exists
            filesnpaths.is_file_exists(t)

        tasks = dict([(utils.fix_name(os.path.basename(t)[:-5]), t)
                      for t in task_list])

        return (tasks)
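To make the expected config structure concrete, here is a minimal, hypothetical config file covering the keys used in these snippets ('ROOT_DIR' and 'pairs_rds' appear in Examples #2 and #4). The file names below are made up, and 'tasks' could equally be a single task file or a directory containing *.task files:

{
    "ROOT_DIR": "my_analysis/Flow",
    "pairs_rds": "pairs.rds",
    "tasks": ["tasks/align.task", "tasks/call_variants.task"]
}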
Example #9
    def get_rule_param(self, task, param, wildcards):
        param_value = ''

        if param not in self.param_dataframes[task].index:
            task_file = self.tasks[task]
            raise ConfigError('Someone is requesting a parameter that is not \
                               defined in the task file. Here are the details: \
                               The parameter %s was requested for %s, but it \
                               is not listed in the task file: %s' %
                              (param, task, task_file))

        param_column_name = self.get_param_name_from_task_file(task, param)
        if utils.is_param_a_literal(param_column_name):
            # if it is a literal then we simply return the literal value
            param_value = utils.fix_param(param_column_name)

        elif param_column_name in self.pairs.columns:
            # if there is such a column already in the pairs table then we read the value from there
            param_value = self.pairs.loc[wildcards.pair, param_column_name]
            if pd.isna(param_value):
                param_value = ''
        elif param_column_name == self.pairs.index.name:
            # the parameter is the key parameter (usually "pair")
            param_value = wildcards.pair

        if not param_value:
            # get the default value from the task file
            param_value = self.get_default_value_from_task_file(task, param)

        if not param_value:
            if param not in self.output_param_dict:
                raise ConfigError(
                    'The following parameter is missing: "%s" from \
                                   the pairs table for the pair id: "%s". You must \
                                   either populate the pairs table or provide a \
                                   default value in the %s task file.' %
                    (param, wildcards.pair, task))

        if param_value and (self.get_param_type_from_task_file(task, param)
                            == 'path'):
            param_value = utils.fix_path(param_value)

        return (param_value)
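The generated rule text is not shown in these snippets, but get_rule_param is presumably consumed from a rule's params section through a wildcard-aware lambda. A hedged, hand-written sketch of such a fragment follows (the task and parameter names are hypothetical; the real definitions are produced by get_snakefile_param_definition, which is not shown here):

# hypothetical fragment of a generated Snakefile rule
rule align:
    params:
        min_mapq = lambda wildcards: align_workflow_object.get_rule_param('align', 'min_mapq', wildcards)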
Example #10
def run_command(cmdline,
                log_file_path,
                first_line_of_log_is_cmdline=True,
                remove_log_file_if_exists=True,
                silent=False):
    """Uses subprocess.call to run your `cmdline`"""

    cmdline = format_cmdline(cmdline)

    filesnpaths.is_output_file_writable(log_file_path)

    if remove_log_file_if_exists and os.path.exists(log_file_path):
        os.remove(log_file_path)

    try:
        if first_line_of_log_is_cmdline:
            with open(log_file_path, "a") as log_file:
                log_file.write('# DATE: %s\n# CMD LINE: %s\n' %
                               (get_date(), ' '.join(cmdline)))

        if not silent:
            print('Running the command: "%s". Log file: %s' %
                  (' '.join(cmdline), log_file_path))

        with open(log_file_path, 'a') as log_file:
            ret_val = subprocess.call(cmdline,
                                      shell=False,
                                      stdout=log_file,
                                      stderr=subprocess.STDOUT)

        if ret_val < 0:
            raise ConfigError(
                "The command was terminated. There could be a hint here: %s." %
                log_file_path)
        else:
            return ret_val
    except OSError as e:
        raise ConfigError(
            "The command failed for the following reason: '%s' ('%s')" %
            (e, cmdline))
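A minimal usage sketch, assuming the program being called is on the PATH (the command and file names are made up):

# run an external program and send its stdout/stderr to a log file
ret_val = run_command(['samtools', 'view', '-b', 'input.sam', '-o', 'output.bam'],
                      '00_LOGS/samtools_view.log')
if ret_val != 0:
    print('samtools exited with a non-zero status; see 00_LOGS/samtools_view.log')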
Example #11
    def add_task_definitions(self, workflow_name):
        ''' Iterate through tasks to populate input and output definitions and module paths'''

        if not self.tasks:
            raise ConfigError(
                'You must include at least one task in your config file.')

        for task in self.tasks:
            self.read_task_file(task)

        #self.check_input_params()
        self.populate_io_dict()
        self.update_defaults_using_output_parameters()
        self.update_targets()
Example #12
def format_cmdline(cmdline):
    """Takes a cmdline for `run_command` or `run_command_STDIN`, and makes it beautiful."""
    if not cmdline or (not isinstance(cmdline, str)
                       and not isinstance(cmdline, list)):
        raise ConfigError(
            "You made ultis::format_cmdline upset. The parameter you sent to run kinda sucks. It should be string\
                            or list type. Note that the parameter `shell` for subprocess.call in this `run_command` function\
                            is always False, therefore if you send a string type, it will be split into a list prior to being\
                            sent to subprocess.")

    if isinstance(cmdline, str):
        cmdline = [str(x) for x in cmdline.split(' ')]
    else:
        cmdline = [str(x) for x in cmdline]

    return cmdline
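A quick sketch of what format_cmdline does with its two accepted input types (the commands are made up):

format_cmdline('samtools view -b input.sam')    # -> ['samtools', 'view', '-b', 'input.sam']
format_cmdline(['head', '-n', 10, 'file.txt'])  # -> ['head', '-n', '10', 'file.txt'] (elements coerced to str)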
Example #13
def is_param_a_literal(param):
    ''' Parameters that are surrounded by quotes are considered literals by Flow
    meaning they are used as is, instead of defining a column name in the pairs table
    '''
    if param is None:
        return False

    if type(param) is not str:
        raise ConfigError(
            'Parameters must be of type %s, but someone provided a parameter of type %s. \
                           This is the kind of error you should never encounter so you might have to contact \
                           one of the developers.' % (str, type(param)))

    for q in ['"', "'"]:
        if param.startswith(q) and param.endswith(q) and len(param) > 1:
            return True

    return False
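A short sketch of the distinction this function draws (the values are made up):

is_param_a_literal('"30"')       # True:  quoted, used as-is
is_param_a_literal("'out.tsv'")  # True:  quoted, used as-is
is_param_a_literal('bam_path')   # False: treated as a column name in the pairs table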
Example #14
def get_snakefile_output_param(task, param, filename, wildcards='pair'):
    s = "        {param} = os.path.realpath(os.path.join({task}_workflow_object.ROOT_DIR, dirs_dict['{task}'], '{wildcards}', '{filename}'))"
    if type(wildcards) == list:
        # this is a placeholder in case we want to use multiple wildcards in the future
        # notice that if we go down this road then we would need to also treat the log definition to contain all wildcards
        # as well as change get_rule_param to be compatible with such a change
        wildcards_str = ', '.join(
            ['{%s}' % wildcard for wildcard in wildcards])
    elif type(wildcards) == str:
        wildcards_str = '{%s}' % wildcards
    else:
        raise ConfigError('Wildcards must be either a single string or a list \
                           of strings, but an object of type %s was provided.'
                          % type(wildcards))
    s = s.format(param=param,
                 task=task,
                 filename=filename,
                 wildcards=wildcards_str)
    return (s)
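For example, a hypothetical call (the task, parameter, and file name are made up) renders the template like this:

print(get_snakefile_output_param('align', 'bam_out', 'aligned.bam'))
# ->         bam_out = os.path.realpath(os.path.join(align_workflow_object.ROOT_DIR, dirs_dict['align'], '{pair}', 'aligned.bam'))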
Example #15
def check_for_R_packages(required_packages):
    # Before we do anything let's make sure the user has R installed
    is_program_exists('Rscript')

    # Let's make sure all the required packages are installed
    missing_packages = []

    log_file = filesnpaths.get_temp_file_path()
    for lib in required_packages:
        ret_val = run_command(
            ["Rscript", "-e", "library('%s')" % lib], log_file, silent=True)
        if ret_val != 0:
            missing_packages.append(lib)

    if missing_packages:
        raise ConfigError(
            'The following R packages are required in order to run \
                           this program, but are missing: %s.' %
            ', '.join(missing_packages))
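A usage sketch with a hypothetical package list:

# raises ConfigError if Rscript is missing or if any of these libraries fail to load
check_for_R_packages(['data.table', 'ggplot2'])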
Example #16
def is_program_exists(program, dont_raise=False):
    IsExe = lambda p: os.path.isfile(p) and os.access(p, os.X_OK)

    fpath, fname = os.path.split(program)

    if fpath:
        if IsExe(program):
            return program
    else:
        for path in os.environ["PATH"].split(os.pathsep):
            path = os.path.expanduser(path).strip('"')
            exe_file = os.path.join(path, program)
            if IsExe(exe_file):
                return exe_file

    if dont_raise:
        return False

    raise ConfigError("The following software: '%s' needs to be installed on your system, but it doesn't seem to appear\
                        in your path :/ If you are certain you have it on your system (for instance you can run it\
                        by typing '%s' in your terminal window), you may want to send a detailed bug report. Sorry!"\
                        % (program, program))
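A short usage sketch (the program names are illustrative):

is_program_exists('Rscript')                       # returns the full path, or raises ConfigError
if not is_program_exists('bwa', dont_raise=True):  # returns False instead of raising when missing
    print('bwa was not found on your PATH')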
Example #17
    def read_task_file(self, task):
        ''' Populate the param_dataframes, input_param_dict, output_param_dict, and module path by reading the task file'''
        task_file = self.tasks.get(task)

        if not task_file:
            raise ConfigError(
                'No task file was provided for task "%s" in your config file.'
                % task)

        try:
            filesnpaths.is_file_exists(task_file)
        except FilesNPathsError:
            raise ConfigError(
                'The task file "%s" does not exist, and yet it was \
                               provided for task "%s" in your config file.' %
                (task_file, task))

        # read the entire parameter table from the task file
        param_dataframe = utils.load_param_table_from_task_file(task_file)

        param_dataframe.index = [
            s.replace('.', '_') for s in param_dataframe.index
        ]

        # make sure that all input params are of "param_type" path or value
        bad_params = [
            p for p in param_dataframe.loc[param_dataframe['io_type'] ==
                                           'input'].index
            if param_dataframe.loc[p, 'param_type'] not in ['path', 'value']
        ]
        if bad_params:
            raise ConfigError(
                'Input parameters must be defined as either "path" \
                               or "value", but your task file "%s" declares the \
                               param "%s" as "%s"' %
                (task_file, bad_params[0], param_dataframe.loc[bad_params[0],
                                                               'param_type']))

        for iparam in param_dataframe.loc[
            (param_dataframe['io_type'] == 'input')
                & (param_dataframe['param_type'] == 'path')].index:
            # check for literals and store them as defaults
            if utils.is_param_a_literal(
                    param_dataframe.loc[iparam, 'param_name_in_pairs_table']):
                # store the literal as the default value
                param_dataframe.loc[iparam,
                                    'default_value'] = param_dataframe.loc[
                                        iparam, 'param_name_in_pairs_table']
                # remove the value from the param_name_in_pairs_table column (since we don't expect such a column to exist)
                param_dataframe.loc[iparam, 'param_name_in_pairs_table'] = None

            # populate param dict
            if self.input_param_dict.get(iparam):
                # such a parameter was already defined,
                # so append this task name
                self.input_param_dict[iparam].append(task)
            else:
                self.input_param_dict[iparam] = [task]

        self.param_dataframes[task] = param_dataframe

        for oparam in param_dataframe.loc[param_dataframe['io_type'] ==
                                          'output'].index:
            # populate param dict
            if self.output_param_dict.get(oparam):
                # such a parameter was already defined
                # in the future we might allow this, but for now we will raise an error
                raise ConfigError(
                    'An output parameter can only be defined once \
                                   for a single task, yet two of your task files \
                                   ("%s" and "%s") define the same output: "%s"\
                                   ' %
                    (task, self.output_param_dict[oparam], oparam))
            else:
                self.output_param_dict[oparam] = task

        # get the module dir path
        self.modules[task] = utils.get_module_path_from_task_file(task_file)
Example #18
    def __init__(self, args):
        A = lambda x: self.args.__dict__[x] if x in self.args.__dict__ else None

        self.args = args
        self.config = A('config')
        self.config_file = A('config_file')
        self.name = A('name')
        self.output_dir = A('output_dir')
        self.W = WorkflowSuperClass(
            argparse.Namespace(config_file=self.config_file))
        self.W.init(self.name)

        # in the future we might turn this into a list of wildcards
        wildcard = '{pair}'

        if os.path.isdir(self.output_dir):
            raise ConfigError(
                'There is already an output directory %s. We don\'t \
                               like overwriting stuff.' % self.output_dir)
        # get the template
        with open(get_path_to_snakefile_template()) as f:
            template = f.read()

        allparams = {}
        for task in self.W.param_dataframes:
            d = self.W.param_dataframes[task]
            params = []
            for param in d.loc[(d['io_type'] == 'input')
                               & (d['param_type'] == 'value')].index:
                # iterate through non-file inputs (AKA params)
                params.append(get_snakefile_param_definition(task, param))
            param_str = ',\n'.join(params)
            if param_str:
                # we need to add an extra comma at the end of the params that we are adding,
                # because the "output_dir" and "module_path" params are included in the template.
                param_str = param_str + ','

            inputs = []
            for param in d.loc[(d['io_type'] == 'input')
                               & (d['param_type'] == 'path')].index:
                # iterate through "path" inputs (AKA input files)
                inputs.append(get_snakefile_param_definition(task, param))
            input_str = ',\n'.join(inputs)

            if input_str:
                input_str = '    input:\n' + input_str

            outputs = []
            for param, row in d.loc[d['io_type'] == 'output'].iterrows():
                # iterate through outputs (AKA output files)
                filename = row['param_name_in_pairs_table']
                filename = utils.fix_param(filename)
                outputs.append(
                    get_snakefile_output_param(task, param, filename))
            output_str = ',\n'.join(outputs)

            run_cmd = self.get_shell_command(task)

            format_dict = {
                'task': task,
                'inputs': input_str,
                'outputs': output_str,
                'task_params': param_str,
                'run_cmd': run_cmd,
                'wildcard': wildcard
            }

            snakefile = template.format(**format_dict)
            snakefile = snakefile.replace('<libdir>', '{params.module_path}/')

            snakefile_dir = os.path.join(self.output_dir, task)
            os.makedirs(snakefile_dir, exist_ok=True)
            snakefile_path = os.path.join(snakefile_dir, 'Snakefile')
            with open(snakefile_path, 'w') as f:
                f.write(snakefile)

        # create the main snakefile
        with open(get_path_to_main_snakefile_template()) as f:
            main_template = f.read()
        include_cmd = ''
        for task in self.W.tasks:
            include_cmd = include_cmd + 'include: "' + utils.fix_path(
                os.path.join(self.output_dir, task, 'Snakefile')) + '"' + '\n'

        main_snakefile = main_template.format(name=self.name,
                                              include_cmd=include_cmd)

        snakefile_path = os.path.join(self.output_dir, 'Snakefile')
        print('Writing the main Snakefile for workflow "%s" to: %s' %
              (self.name, snakefile_path))
        with open(snakefile_path, 'w') as f:
            f.write(main_snakefile)