Example #1
0
def get_bustools_rid(params):
    """
    Extract the position of the fastq containing the reads from the bustools -x argument.
    The read_id is the first position of the last triplet in the bc:umi:read string, or is
    hard-coded for the short-hand syntax.
    In: -x 10xv3 -> read_id=1
    In: -x 0,0,16:0,16,26:1,0,0 -> read_id=1
    """
    kb_tech_dict = {
        "10xv2": 1,
        "10xv3": 1,
        "celseq": 1,
        "celseq2": 1,
        "dropseq": 1,
        "scrubseq": 1,
        "indropsv1": 1,
        "indropsv2": 0,
    }
    # Check for occurrence of short-hand tech
    bus_regex = r"(?<!\S)([0-1],\d*,\d*:){2}([0-1],0,0)(?!\S)"
    bus_regex_short = r"(?i)\b(10XV2|10XV3|CELSEQ|CELSEQ2|DROPSEQ|SCRUBSEQ|INDROPSV1|INDROPSV2)\b"

    match = re.search(bus_regex, params)
    match_short = re.search(bus_regex_short, params)
    if match is not None:
        read_id = int(match.group(0).split(":")[-1].split(",")[0])
    elif match_short is not None:
        read_id = kb_tech_dict[match_short.group(0).lower()]
    else:
        logger.error(
            "Not a valid BUS(barcode:umi:set) string. Please check -x argument"
        )
        os._exit(1)  # noqa
    return read_id
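A minimal, self-contained sketch of the triplet parsing above (the regex is copied from the example; the params string and everything else is illustrative): the read position is the first field of the last colon-separated triplet in the -x value.

import re

params = "-x 0,0,16:0,16,26:1,0,0 -o out"
bus_regex = r"(?<!\S)([0-1],\d*,\d*:){2}([0-1],0,0)(?!\S)"

match = re.search(bus_regex, params)
if match is not None:
    read_id = int(match.group(0).split(":")[-1].split(",")[0])
    print(read_id)  # -> 1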
Example #2
0
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples
    """
    sampledict = dict()
    for sample in samples:
        if os.path.exists(
                expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz',
                       **config)[0]):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "SINGLE"
        elif all(
                os.path.exists(path) for path in expand(
                    f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz', **
                    config)):
            sampledict[sample] = dict()
            sampledict[sample]["layout"] = "PAIRED"
        elif sample.startswith(('GSM', 'SRX', 'SRR', 'ERR', 'DRR')):
            continue
        else:
            logger.error(
                f"\nsample {sample} was not found..\n"
                f"We checked for SE file:\n"
                f"\t{config['fastq_dir']}/{sample}.{config['fqsuffix']}.gz \n"
                f"and for PE files:\n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext1']}.{config['fqsuffix']}.gz \n"
                f"\t{config['fastq_dir']}/{sample}_{config['fqext2']}.{config['fqsuffix']}.gz \n"
                f"and since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")
            raise TerminatedException

    return sampledict
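For reference, a small illustration of how the expand() pattern above resolves to concrete paths, assuming expand is snakemake.io.expand and using made-up config values:

from snakemake.io import expand

sample = "mysample"

print(expand(f'{{fastq_dir}}/{sample}.{{fqsuffix}}.gz',
             fastq_dir="fastq", fqsuffix="fastq"))
# -> ['fastq/mysample.fastq.gz']
print(expand(f'{{fastq_dir}}/{sample}_{{fqext}}.{{fqsuffix}}.gz',
             fastq_dir="fastq", fqext=["R1", "R2"], fqsuffix="fastq"))
# -> ['fastq/mysample_R1.fastq.gz', 'fastq/mysample_R2.fastq.gz']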
Example #3
0
    def schedule(self):
        """ Schedule jobs that are ready, maximizing cpu usage. """
        try:
            while True:
                # workaround so that the wait does not prevent keyboard interrupts
                while not self._open_jobs.wait(1):
                    pass

                # obtain needrun and running jobs in a thread-safe way
                with self._lock:
                    needrun = list(self.open_jobs)
                    running = list(self.running)
                # free the event
                self._open_jobs.clear()

                # handle errors
                if not self.keepgoing and self._errors:
                    logger.info("Will exit after finishing "
                                "currently running jobs.")
                    if not running:
                        self._executor.shutdown()
                        logger.error(_ERROR_MSG_FINAL)
                        return False
                    continue
                # normal shutdown because all jobs have been finished
                if not needrun and not running:
                    self._executor.shutdown()
                    if self._errors:
                        logger.error(_ERROR_MSG_FINAL)
                    return not self._errors

                # continue if no new job needs to be executed
                if not needrun:
                    continue

                logger.debug("Resources before job selection: {}".format(
                    self.resources))
                logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                             "\n\t".join(map(str, needrun)))

                # select jobs by solving knapsack problem
                run = self.job_selector(needrun)
                logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                             "\n\t".join(map(str, run)))
                # update running jobs
                with self._lock:
                    self.running.update(run)
                logger.debug(
                    "Resources after job selection: {}".format(self.resources))
                # actually run jobs
                for job in run:
                    self.run(job)
        except (KeyboardInterrupt, SystemExit):
            logger.info("Terminating processes on user request.")
            self._executor.cancel()
            with self._lock:
                running = list(self.running)
            for job in running:
                job.cleanup()
            return False
Example #4
0
def get_dna_paths(url, ext='', proxy=None, params=None):
    """https://stackoverflow.com/questions/11023530/python-to-list-http-files-and-directories
    """
    if params is None:
        params = {}
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}

    response = requests.get(url, proxies=proxy, params=params)
    if response.ok:
        response_text = response.text
    else:
        logger.error("query failed: {}".format(response.status_code))
        return ''
    soup = BeautifulSoup(response_text, 'html.parser')
    parent = [
        os.path.join(url, node.get('href')) for node in soup.find_all('a')
        if node.get('href').endswith(ext)
    ]

    # 1) return the primary assembly if present
    for p in parent:
        if p.endswith('primary_assembly.fa.gz'):
            return [p]
    # 2) return the toplevel assembly if present
    for p in parent:
        if p.endswith('dna.toplevel.fa.gz'):
            return [p]
    # 3) otherwise try to fetch individual chromosomes
    chrom = []
    for p in parent:
        if ('dna.chromosome' in p) or ('dna.nonchromosomal' in p):
            chrom.append(p)
    if len(chrom) == 0:
        logger.error("failed to identify any valid dna in: {}".format(
            str(parent)))
    return chrom
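A minimal reproduction of the link-scraping step with BeautifulSoup on an inline HTML snippet (no network access; the file names are made up):

from bs4 import BeautifulSoup

html = """
<a href="Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz">primary</a>
<a href="Homo_sapiens.GRCh38.dna.toplevel.fa.gz">toplevel</a>
<a href="CHECKSUMS">checksums</a>
"""
soup = BeautifulSoup(html, "html.parser")
links = [node.get("href") for node in soup.find_all("a")
         if node.get("href", "").endswith(".fa.gz")]
print(links)  # the two .fa.gz links, in document order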
Example #5
0
    def _get_bucket(self):
        """get a connection to the storage bucket (self.bucket) and exit
        if the name is taken or otherwise invalid.

        Parameters
        ==========
        workflow: the workflow object to derive the prefix from
        """
        import google

        # Hold path to requested subdirectory and main bucket
        bucket_name = self.workflow.default_remote_prefix.split("/")[0]
        self.gs_subdir = re.sub("^{}/".format(bucket_name), "",
                                self.workflow.default_remote_prefix)
        self.gs_logs = os.path.join(self.gs_subdir, "google-lifesciences-logs")

        # Case 1: The bucket already exists
        try:
            self.bucket = self._bucket_service.get_bucket(bucket_name)

        # Case 2: The bucket needs to be created
        except google.cloud.exceptions.NotFound:
            self.bucket = self._bucket_service.create_bucket(bucket_name)

        # Case 3: The bucket name is already taken or otherwise invalid
        except Exception as ex:
            logger.error("Cannot get or create bucket {}:\n{}".format(
                bucket_name, ex))
            log_verbose_traceback(ex)
            raise ex

        logger.debug("bucket=%s" % self.bucket.name)
        logger.debug("subdir=%s" % self.gs_subdir)
        logger.debug("logs=%s" % self.gs_logs)
Example #6
0
def color_parser(color: str, color_dicts: list = None) -> tuple:
    """
    convert a string with RGB/matplotlib named colors to matplotlib HSV tuples.

    supports RGB colors with ranges between 0-1 or 0-255.

    supported matplotlib colors can be found here:
    https://matplotlib.org/3.3.1/gallery/color/named_colors.html
    """
    # input: RGB
    if color.count(",") == 2:
        value = [float(c) for c in color.split(",")]
        return rgb_to_hsv(value)

    # input: matplotlib colors
    cdicts = color_dicts if color_dicts else DEFAULT_COLOR_DICTS
    for cdict in cdicts:
        if color in cdict:
            value = cdict[color]

            # tableau, css4 and xkcd return hex colors.
            if str(value).startswith("#"):
                value = hex_to_rgb(value)

            return rgb_to_hsv(value)

    logger.error(f"Color not recognized: {color}")
    os._exit(1)  # noqa
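A standalone sketch of the conversion this helper wraps, using only matplotlib's public color API (it does not touch the DEFAULT_COLOR_DICTS lookup above):

from matplotlib.colors import rgb_to_hsv, to_rgb

print(rgb_to_hsv(to_rgb("royalblue")))   # named color -> RGB -> HSV
print(rgb_to_hsv(to_rgb("#1f77b4")))     # hex color -> RGB -> HSV
print(rgb_to_hsv([0.2, 0.4, 0.6]))       # RGB triple in the 0-1 range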
Example #7
0
    def schedule(self):
        """ Schedule jobs that are ready, maximizing cpu usage. """
        try:
            while True:
                # workaround so that the wait does not prevent keyboard interrupts
                while not self._open_jobs.wait(1):
                    pass

                # obtain needrun and running jobs in a thread-safe way
                with self._lock:
                    needrun = list(self.open_jobs)
                    running = list(self.running)
                # free the event
                self._open_jobs.clear()

                # handle errors
                if not self.keepgoing and self._errors:
                    logger.info("Will exit after finishing "
                                "currently running jobs.")
                    if not running:
                        self._executor.shutdown()
                        logger.error(_ERROR_MSG_FINAL)
                        return False
                    continue
                # normal shutdown because all jobs have been finished
                if not needrun and not running:
                    self._executor.shutdown()
                    if self._errors:
                        logger.error(_ERROR_MSG_FINAL)
                    return not self._errors

                # continue if no new job needs to be executed
                if not needrun:
                    continue

                logger.debug("Resources before job selection: {}".format(
                    self.resources))
                logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                             "\n\t".join(map(str, needrun)))

                # select jobs by solving knapsack problem
                run = self.job_selector(needrun)
                logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                             "\n\t".join(map(str, run)))
                # update running jobs
                with self._lock:
                    self.running.update(run)
                logger.debug("Resources after job selection: {}".format(
                    self.resources))
                # actually run jobs
                for job in run:
                    self.run(job)
        except (KeyboardInterrupt, SystemExit):
            logger.info("Terminating processes on user request.")
            self._executor.cancel()
            with self._lock:
                running = list(self.running)
            for job in running:
                job.cleanup()
            return False
Example #8
0
def get_release_ensemblgenomes(release=None):
    """Return the Ensembl Genomes version matching the given Ensembl release.
    """
    try:
        data = _ens_rest_query(ext='/info/eg_version?', release=release)
        return str(int(data['version']))
    except Exception:
        logger.error("release parse error or protocol error at {}".format(
            "ensemblgenomes release query"))
Example #9
0
    def construct_mapping(self, node, deep=False):
        mapping = []
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep).lower()
            if key in mapping:
                logger.error(
                    f"Duplicate key found in the config.yaml: {key}\n")
                os._exit(1)  # noqa
            mapping.append(key)
        return super().construct_mapping(node, deep)
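A self-contained version of the same duplicate-key guard, written as a complete SafeLoader subclass that raises instead of calling os._exit (class and variable names here are illustrative, not taken from the snippet's codebase):

import yaml


class UniqueKeyLoader(yaml.SafeLoader):
    def construct_mapping(self, node, deep=False):
        seen = set()
        for key_node, _ in node.value:
            key = self.construct_object(key_node, deep=deep)
            if key in seen:
                raise ValueError(f"Duplicate key found in the config.yaml: {key}")
            seen.add(key)
        return super().construct_mapping(node, deep)


print(yaml.load("a: 1\nb: 2\n", Loader=UniqueKeyLoader))  # {'a': 1, 'b': 2}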
Example #10
0
def get_release_current_ensembl():
    """Return the current Ensembl release version.
    """
    try:
        data = _ens_rest_query(ext='/info/data?')
        return str(int(data['releases'][0]))
    except Exception:
        logger.error("release parse error or protocol error at {}".format(
            "ensembl release query"))
Example #11
0
    def __init__(self):
        # check if it is already initialized
        if self.instance is not None:
            self.config = self.instance.config
            self.conf_dict = self.instance.conf_dict
            self.args = self.instance.args
            self.path = self.instance.path
            self.snakefile = self.instance.snakefile
            self.snakeroot = self.instance.snakeroot
            return

        # we don't need the first argument, i.e. the call to snakemake itself
        self.sysargs = sys.argv[1:]

        parser = get_argument_parser()
        self.args = parser.parse_args(self.sysargs)
        self.path = self.args.configfile
        self.snakefile = self.args.snakefile
        self.config = parse_config(self.args)

        if self.path is None:
            for p in ["wbuild.yaml", "config.yaml", "wBuild.yaml"]:
                if os.path.exists(p):
                    self.path = p
                    break
        else:
            if type(self.path) is list:
                self.path = self.path[0]
            self.path = os.path.abspath(self.path)

        # this is taken from the snakemake main file
        if self.snakefile is None:
            for p in SNAKEFILE_CHOICES:
                if os.path.exists(p):
                    self.snakefile = p
                    break
        self.snakeroot = os.path.dirname(self.snakefile)

        # load defaults
        self.loadDefaultConfiguration()

        try:
            fh = open(self.path, "r")
        except IOError:
            raise IOError("Cannot read config. Are you sure you have sufficient "
                          "permissions and that the config path (wbuild.yaml) is correct?")
        with fh:
            configDict = next(yaml.safe_load_all(fh))
        if configDict is None:
            logger.error(
                "Error parsing wbuild.yaml - format is wrong. Working with defaults..."
            )
        else:
            self.conf_dict = merge_two_dicts(self.conf_dict, configDict)
        # fill the Singleton
        Config.instance = self
Example #12
0
 def log_error(self, msg=None, indent=False, **kwargs):
     logger.job_error(
         name=self.rule.name,
         jobid=self.dag.jobid(self),
         output=list(format_files(self, self.output, self.dynamic_output)),
         log=list(self.log),
         conda_env=self.conda_env.path if self.conda_env else None,
         aux=kwargs,
         indent=indent)
     if msg is not None:
         logger.error(msg)
Example #13
0
def shellcmd(img_path,
             cmd,
             args="",
             envvars=None,
             shell_executable=None,
             container_workdir=None):
    """Execute shell command inside singularity container given optional args
       and environment variables to be passed."""

    # if img_path is given here, why do we have self.path??

    # I suppose this is needed for the very smart inheritance of ENVS by singularity
    # if envvars:
    #     envvars = " ".join("SINGULARITYENV_{}={}".format(k, v)
    #                        for k, v in envvars.items())
    # else:
    #     envvars = ""
    if container_workdir is not None:
        logger.error(
            '--use-docker does not support shadow directories yet (to be implemented)')
    # We will handle HERE occam based on an env variable to avoid code duplication
    if shell_executable is None:
        shell_executable = "sh"
    else:
        # Ensure to just use the name of the executable, not a path,
        # because we cannot be sure where it is located in the container.
        shell_executable = os.path.split(shell_executable)[-1]

    # TODO add uid

    wd = os.getcwd()  # this won't work for subworkflows? TODO
    logger.debug("working directory: {}".format(wd))
    if os.environ.get('RUN_ENV') == 'occam':
        #occam-run -v /home/egrassi:/home/egrassi  egrassi/bit_docker:small sh -c "cd /home/egrassi/bit_docker; bmake test_bmaked"
        args_v = re.sub('--volume', '-v', args)
        # workaround for a configargparse problem with --docker-args "-v ..." when -v is also a snakemake argument
        cmd = "cd {};".format(wd) + cmd
        cmd = "occam-run {} {} {} -c '{}'".format(args_v, img_path,
                                                  shell_executable,
                                                  cmd.replace("'", r"'\''"))
    else:
        args += " -v {}:{}".format(
            SNAKEMAKE_SEARCHPATH,
            SNAKEMAKE_MOUNTPOINT)  # TODO try to remove and see what breaks
        # TODO we need to mount current dir or do we leave it to the user? user for now, need to mount whole bioinfo_root/equivalent
        args += " --user {}:{} ".format(os.getuid(), os.getgid())
        cmd = "cd {};".format(wd) + cmd
        cmd = "docker run {} {} {} -c '{}'".format(args, img_path,
                                                   shell_executable,
                                                   cmd.replace("'", r"'\''"))

    logger.debug(cmd)
    return cmd
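The single-quote escaping used above (replacing each ' with '\'') is the standard trick for embedding arbitrary text in a single-quoted sh -c argument; a tiny check on a POSIX shell (the command is made up):

import subprocess

cmd = "echo 'hello world'"
wrapped = "sh -c '{}'".format(cmd.replace("'", r"'\''"))
print(wrapped)
print(subprocess.run(wrapped, shell=True, capture_output=True, text=True).stdout)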
Example #14
0
    def wrap(*args, **kwargs):
        # we get two tries, in case parallel executions are interfering with one another
        for _ in range(2):
            try:
                return func(*args, **kwargs)
            except FileNotFoundError:
                time.sleep(1)
        else:
            logger.error(
                "There were some problems with locking the seq2science cache. Please try again in a bit."
            )
            os._exit(1)  # noqa
Example #15
0
def _convert_units_to_mb(memory):
    """If memory is specified with an SI unit, convert it to MB."""
    if isinstance(memory, (int, float)):
        return int(memory)
    siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6}
    regex = re.compile(r"(\d+)({})$".format("|".join(siunits.keys())))
    m = regex.match(memory)
    if m is None:
        logger.error((f"unsupported memory specification '{memory}';"
                      "  allowed suffixes: [K|M|G|T]"))
        sys.exit(1)
    factor = siunits[m.group(2)]
    return int(int(m.group(1)) * factor)
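A quick standalone check of the same unit-conversion idea (regex and factors taken from the snippet above; the helper name and the asserts are illustrative):

import re


def convert_units_to_mb(memory):
    if isinstance(memory, (int, float)):
        return int(memory)
    siunits = {"K": 1e-3, "M": 1, "G": 1e3, "T": 1e6}
    m = re.match(r"(\d+)(K|M|G|T)$", str(memory))
    if m is None:
        raise ValueError(f"unsupported memory specification '{memory}'")
    return int(int(m.group(1)) * siunits[m.group(2)])


assert convert_units_to_mb("4G") == 4000
assert convert_units_to_mb("512K") == 0   # rounds down to whole MB
assert convert_units_to_mb(100) == 100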
Example #16
0
def get_refseq_assembly_summary(url, release, proxy=None, protocol='https://'):
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}
    if proxy:
        proxy_support = request.ProxyHandler(proxy)
        request.install_opener(request.build_opener(proxy_support))
    logger.info('fetching {} ...'.format(protocol + url))
    with closing(request.urlopen(protocol + url)) as response:
        out = response.read().decode('utf-8')
    try:
        tab = pd.read_csv(io.StringIO(out), sep="\t", skiprows=1)
    except Exception:
        logger.error('failed to parse url into a dataframe: {}'.format(url))
        return None
    return tab
Example #17
0
def get_file_hash(filename, algorithm="sha256"):
    """Return the hex digest of a file (SHA-256 by default). We use this so
    that the user can choose to cache working directories in storage.
    """
    from snakemake.logging import logger

    # The algorithm must be available
    try:
        hasher = hashlib.new(algorithm)
    except ValueError as ex:
        logger.error("%s is not an available algorithm." % algorithm)
        raise ex

    with open(filename, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hasher.update(chunk)
    return hasher.hexdigest()
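A minimal usage sketch of the chunked-hashing pattern above, run against a throwaway temporary file (hashlib and tempfile are standard library; the helper name is illustrative):

import hashlib
import tempfile


def file_digest(path, algorithm="sha256", chunk_size=4096):
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            hasher.update(chunk)
    return hasher.hexdigest()


with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"hello world\n")
    path = tmp.name
print(file_digest(path))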
Example #18
0
    def run(self, job,
            callback=None,
            submit_callback=None,
            error_callback=None):
        super()._run(job)
        workdir = os.getcwd()
        jobid = self.dag.jobid(job)

        jobscript = self.get_jobscript(job)
        jobfinished = os.path.join(self.tmpdir, "{}.jobfinished".format(jobid))
        jobfailed = os.path.join(self.tmpdir, "{}.jobfailed".format(jobid))
        self.spawn_jobscript(job, jobscript,
                             jobfinished=jobfinished,
                             jobfailed=jobfailed)

        deps = " ".join(self.external_jobid[f] for f in job.input
                        if f in self.external_jobid)
        try:
            submitcmd = job.format_wildcards(
                self.submitcmd,
                dependencies=deps,
                cluster=self.cluster_wildcards(job))
        except AttributeError as e:
            raise WorkflowError(str(e), rule=job.rule)
        try:
            ext_jobid = subprocess.check_output(
                '{submitcmd} "{jobscript}"'.format(submitcmd=submitcmd,
                                                   jobscript=jobscript),
                shell=True).decode().split("\n")
        except subprocess.CalledProcessError as ex:
            logger.error("Error submitting jobscript (exit code {}):\n{}".format(
                    ex.returncode, ex.output.decode()))
            error_callback(job)
            return
        if ext_jobid and ext_jobid[0]:
            ext_jobid = ext_jobid[0]
            self.external_jobid.update((f, ext_jobid) for f in job.output)
            logger.debug("Submitted job {} with external jobid {}.".format(
                jobid, ext_jobid))

        submit_callback(job)
        with self.lock:
            self.active_jobs.append(
                GenericClusterJob(job, callback, error_callback, jobscript,
                                  jobfinished, jobfailed))
Example #19
0
def assert_versions(rules_dir):
    """
    For each package, check that the installed version matches the required version
    """
    error = False
    yaml_file = _get_yaml_file(rules_dir)
    versions = _get_yaml_versions(yaml_file)
    for package, required_version in versions.items():
        current_version = _get_current_version(package)
        if current_version is None:
            continue
        if not current_version.startswith(required_version):
            logger.error(
                f"Seq2science requires {package.capitalize()} version {required_version}, "
                f"found version {current_version}.")
            error = True
    if error:
        logger.error("Please create a new conda environment.\n")
        os._exit(1)  # noqa
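One plausible way to implement the version lookup this check relies on, using importlib.metadata from the standard library (the _get_current_version helper above is not shown, so this is only an assumption about its behaviour):

from importlib.metadata import PackageNotFoundError, version


def get_current_version(package):
    try:
        return version(package)
    except PackageNotFoundError:
        return None


print(get_current_version("pip"))        # e.g. '23.2.1'
print(get_current_version("not-a-pkg"))  # None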
Example #20
0
def parseYamlParams(header, f):
    """
    :param header: String form of YAML header
    :param f: Filename of a file from where the header was parsed
    :return: Parameters dictionary parsed from the header; None if parsing errors occurred
    """
    try:
        param = next(yaml.safe_load_all(header))
    except (yaml.scanner.ScannerError, yaml.parser.ParserError,
            yaml.error.YAMLError, yaml.error.MarkedYAMLError) as e:
        if hasattr(e, 'problem_mark'):
            if e.context is not None:
                logger.error('Error while parsing YAML area in the file ' + f +
                             ':\n' + str(e.problem_mark) + '\n  ' +
                             str(e.problem) + ' ' + str(e.context) +
                             '\nPlease correct the header and retry.')
            else:
                logger.error('Error while parsing YAML area in the file ' + f +
                             ':\n' + str(e.problem_mark) + '\n  ' +
                             str(e.problem) +
                             '\nPlease correct the header and retry.')
        else:
            logger.error("YAMLError parsing yaml file.")

        return None
    except Exception as e:
        print(
            bcolors.FAIL + bcolors.BOLD + 'Could not parse ' + f +
            '. Include a valid yaml header. Not showing any further errors.\n'
            'Errors: {0}'.format(e) + bcolors.ENDC)
        return None

    logger.debug("Parsed params: " + str(param) + "\n.")
    return param
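A compact, standalone variant of the same idea: parse a YAML header string and report the problem location on failure (PyYAML only; the function name and file names are illustrative):

import yaml


def parse_yaml_header(header, filename):
    try:
        return next(yaml.safe_load_all(header))
    except yaml.YAMLError as e:
        mark = getattr(e, "problem_mark", None)
        where = f" at line {mark.line + 1}, column {mark.column + 1}" if mark else ""
        print(f"Error while parsing the YAML header of {filename}{where}: {e}")
        return None


print(parse_yaml_header("title: demo\nthreads: 4\n", "demo.R"))   # {'title': 'demo', 'threads': 4}
print(parse_yaml_header("title: [unclosed\n", "broken.R"))        # prints the error, returns None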
Example #21
0
def samples2metadata_local(samples: List[str], config: dict, logger) -> dict:
    """
    (try to) get the metadata of local samples
    """
    SAMPLEDICT = dict()
    for sample in samples:
        local_fastqs = glob.glob(
            os.path.join(config["fastq_dir"],
                         f'{sample}*{config["fqsuffix"]}*.gz'))
        if len(local_fastqs) == 1:
            SAMPLEDICT[sample] = dict()
            SAMPLEDICT[sample]["layout"] = "SINGLE"
        elif (len(local_fastqs) == 2 and any([
                config["fqext1"] in os.path.basename(f) for f in local_fastqs
        ]) and any(
            [config["fqext2"] in os.path.basename(f) for f in local_fastqs])):
            SAMPLEDICT[sample] = dict()
            SAMPLEDICT[sample]["layout"] = "PAIRED"
        elif sample.startswith(
            ("GSM", "DRX", "ERX", "SRX", "DRR", "ERR", "SRR")):
            continue
        else:
            extend_msg = ""
            if len(local_fastqs) > 2:
                extend_msg = (
                    f"We found too many files matching ({len(local_fastqs)}) "
                    "and could not distinguish them:\n" +
                    ", ".join([os.path.basename(f)
                               for f in local_fastqs]) + ".\n")

            logger.error(
                f"\nsample {sample} was not found..\n"
                f"We checked directory '{config['fastq_dir']}' "
                f"for gzipped files starting with '{sample}' and containing '{config['fqsuffix']}'.\n"
                + extend_msg +
                f"Since the sample did not start with either GSM, SRX, SRR, ERR, and DRR we "
                f"couldn't find it online..\n")
            os._exit(1)  # noqa

    return SAMPLEDICT
Example #22
0
def get_current_refseq_from_www(proxy=None, protocol='https://'):
    url = 'ftp.ncbi.nlm.nih.gov/refseq/release/RELEASE_NUMBER'
    if isinstance(proxy, str):
        proxy = {'ftp': proxy, 'http': proxy, 'https': proxy}
    if proxy:
        proxy_support = request.ProxyHandler(proxy)
        request.install_opener(request.build_opener(proxy_support))
    with closing(request.urlopen(protocol + url)) as response:
        try:
            logger.info('fetching {} ...'.format(url))
            out = response.read()
            release = str(int(re.findall(rb"\d+", out)[0]))
        except Exception:
            logger.error("release parse error or protocol error")
            raise
    return release
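The release lookup boils down to pulling the first integer out of the downloaded RELEASE_NUMBER file; a minimal reproduction on made-up file content:

import re

out = b"221\n"  # e.g. the bytes read from refseq/release/RELEASE_NUMBER
release = str(int(re.findall(rb"\d+", out)[0]))
print(release)  # "221"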
Example #23
0
def print_exception(ex, linemaps):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    log_verbose_traceback(ex)
    if isinstance(ex, (SyntaxError, IndentationError)):
        logger.error(
            format_error(ex,
                         ex.lineno,
                         linemaps=linemaps,
                         snakefile=ex.filename,
                         show_traceback=True))
        return
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.error(
            format_error(ex,
                         lineno,
                         linemaps=linemaps,
                         snakefile=file,
                         show_traceback=True))
        return
    elif isinstance(ex, TokenError):
        logger.error(format_error(ex, None, show_traceback=False))
    elif isinstance(ex, MissingRuleException):
        logger.error(
            format_error(ex,
                         None,
                         linemaps=linemaps,
                         snakefile=ex.filename,
                         show_traceback=False))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.error(
                    format_error(e,
                                 e.lineno,
                                 linemaps=linemaps,
                                 snakefile=e.filename,
                                 show_traceback=True))
    elif isinstance(ex, WorkflowError):
        logger.error(
            format_error(ex,
                         ex.lineno,
                         linemaps=linemaps,
                         snakefile=ex.snakefile,
                         show_traceback=True))
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
Example #24
0
    def execute(
        self,
        targets=None,
        dryrun=False,
        touch=False,
        local_cores=1,
        forcetargets=False,
        forceall=False,
        forcerun=None,
        until=[],
        omit_from=[],
        prioritytargets=None,
        quiet=False,
        keepgoing=False,
        printshellcmds=False,
        printreason=False,
        printdag=False,
        cluster=None,
        cluster_config=None,
        cluster_sync=None,
        jobname=None,
        immediate_submit=False,
        ignore_ambiguity=False,
        printrulegraph=False,
        printfilegraph=False,
        printd3dag=False,
        drmaa=None,
        drmaa_log_dir=None,
        kubernetes=None,
        tibanna=None,
        tibanna_sfn=None,
        precommand="",
        tibanna_config=False,
        container_image=None,
        stats=None,
        force_incomplete=False,
        ignore_incomplete=False,
        list_version_changes=False,
        list_code_changes=False,
        list_input_changes=False,
        list_params_changes=False,
        list_untracked=False,
        list_conda_envs=False,
        summary=False,
        archive=None,
        delete_all_output=False,
        delete_temp_output=False,
        detailed_summary=False,
        latency_wait=3,
        wait_for_files=None,
        nolock=False,
        unlock=False,
        notemp=False,
        nodeps=False,
        cleanup_metadata=None,
        conda_cleanup_envs=False,
        cleanup_shadow=False,
        cleanup_scripts=True,
        subsnakemake=None,
        updated_files=None,
        keep_target_files=False,
        keep_shadow=False,
        keep_remote_local=False,
        allowed_rules=None,
        max_jobs_per_second=None,
        max_status_checks_per_second=None,
        greediness=1.0,
        no_hooks=False,
        force_use_threads=False,
        conda_create_envs_only=False,
        assume_shared_fs=True,
        cluster_status=None,
        report=None,
        report_stylesheet=None,
        export_cwl=False,
        batch=None,
        keepincomplete=False,
    ):

        self.check_localrules()
        self.immediate_submit = immediate_submit
        self.cleanup_scripts = cleanup_scripts

        def rules(items):
            return map(self._rules.__getitem__, filter(self.is_rule, items))

        if keep_target_files:

            def files(items):
                return filterfalse(self.is_rule, items)

        else:

            def files(items):
                relpath = lambda f: f if os.path.isabs(f) else os.path.relpath(
                    f)
                return map(relpath, filterfalse(self.is_rule, items))

        if not targets:
            targets = [self.first_rule
                       ] if self.first_rule is not None else list()

        if prioritytargets is None:
            prioritytargets = list()
        if forcerun is None:
            forcerun = list()
        if until is None:
            until = list()
        if omit_from is None:
            omit_from = list()

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        untilrules = set(rules(until))
        untilfiles = set(files(until))
        omitrules = set(rules(omit_from))
        omitfiles = set(files(omit_from))
        targetrules = set(
            chain(
                rules(targets),
                filterfalse(Rule.has_wildcards, priorityrules),
                filterfalse(Rule.has_wildcards, forcerules),
                filterfalse(Rule.has_wildcards, untilrules),
            ))
        targetfiles = set(
            chain(files(targets), priorityfiles, forcefiles, untilfiles))
        if forcetargets:
            forcefiles.update(targetfiles)
            forcerules.update(targetrules)

        rules = self.rules
        if allowed_rules:
            rules = [rule for rule in rules if rule.name in set(allowed_rules)]

        if wait_for_files is not None:
            try:
                snakemake.io.wait_for_files(wait_for_files,
                                            latency_wait=latency_wait)
            except IOError as e:
                logger.error(str(e))
                return False

        dag = DAG(
            self,
            rules,
            dryrun=dryrun,
            targetfiles=targetfiles,
            targetrules=targetrules,
            # when cleaning up conda, we should enforce all possible jobs
            # since their envs shall not be deleted
            forceall=forceall or conda_cleanup_envs,
            forcefiles=forcefiles,
            forcerules=forcerules,
            priorityfiles=priorityfiles,
            priorityrules=priorityrules,
            untilfiles=untilfiles,
            untilrules=untilrules,
            omitfiles=omitfiles,
            omitrules=omitrules,
            ignore_ambiguity=ignore_ambiguity,
            force_incomplete=force_incomplete,
            ignore_incomplete=ignore_incomplete or printdag or printrulegraph
            or printfilegraph,
            notemp=notemp,
            keep_remote_local=keep_remote_local,
            batch=batch,
        )

        self.persistence = Persistence(
            nolock=nolock,
            dag=dag,
            conda_prefix=self.conda_prefix,
            singularity_prefix=self.singularity_prefix,
            shadow_prefix=self.shadow_prefix,
            warn_only=dryrun or printrulegraph or printfilegraph or printdag
            or summary or archive or list_version_changes or list_code_changes
            or list_input_changes or list_params_changes or list_untracked
            or delete_all_output or delete_temp_output,
        )

        if cleanup_metadata:
            for f in cleanup_metadata:
                self.persistence.cleanup_metadata(f)
            return True

        logger.info("Building DAG of jobs...")
        dag.init()
        dag.update_checkpoint_dependencies()
        # check incomplete has to run BEFORE any call to postprocess
        dag.check_incomplete()
        dag.check_dynamic()

        if unlock:
            try:
                self.persistence.cleanup_locks()
                logger.info("Unlocking working directory.")
                return True
            except IOError:
                logger.error("Error: Unlocking the directory {} failed. Maybe "
                             "you don't have the permissions?")
                return False
        try:
            self.persistence.lock()
        except IOError:
            logger.error(
                "Error: Directory cannot be locked. Please make "
                "sure that no other Snakemake process is trying to create "
                "the same files in the following directory:\n{}\n"
                "If you are sure that no other "
                "instances of snakemake are running on this directory, "
                "the remaining lock was likely caused by a kill signal or "
                "a power loss. It can be removed with "
                "the --unlock argument.".format(os.getcwd()))
            return False

        if cleanup_shadow:
            self.persistence.cleanup_shadow()
            return True

        if (self.subworkflows and not printdag and not printrulegraph
                and not printfilegraph):
            # backup globals
            globals_backup = dict(self.globals)
            # execute subworkflows
            for subworkflow in self.subworkflows:
                subworkflow_targets = subworkflow.targets(dag)
                logger.debug(
                    "Files requested from subworkflow:\n    {}".format(
                        "\n    ".join(subworkflow_targets)))
                updated = list()
                if subworkflow_targets:
                    logger.info("Executing subworkflow {}.".format(
                        subworkflow.name))
                    if not subsnakemake(
                            subworkflow.snakefile,
                            workdir=subworkflow.workdir,
                            targets=subworkflow_targets,
                            configfiles=[subworkflow.configfile]
                            if subworkflow.configfile else None,
                            updated_files=updated,
                    ):
                        return False
                    dag.updated_subworkflow_files.update(
                        subworkflow.target(f) for f in updated)
                else:
                    logger.info("Subworkflow {}: Nothing to be done.".format(
                        subworkflow.name))
            if self.subworkflows:
                logger.info("Executing main workflow.")
            # rescue globals
            self.globals.update(globals_backup)

        dag.postprocess()
        # deactivate IOCache such that from now on we always get updated
        # size, existence and mtime information
        # ATTENTION: this may never be removed without really good reason.
        # Otherwise weird things may happen.
        self.iocache.deactivate()
        # clear and deactivate persistence cache, from now on we want to see updates
        self.persistence.deactivate_cache()

        if nodeps:
            missing_input = [
                f for job in dag.targetjobs for f in job.input
                if dag.needrun(job) and not os.path.exists(f)
            ]
            if missing_input:
                logger.error(
                    "Dependency resolution disabled (--nodeps) "
                    "but missing input "
                    "files detected. If this happens on a cluster, please make sure "
                    "that you handle the dependencies yourself or turn off "
                    "--immediate-submit. Missing input files:\n{}".format(
                        "\n".join(missing_input)))
                return False

        updated_files.extend(f for job in dag.needrun_jobs for f in job.output)

        if export_cwl:
            from snakemake.cwl import dag_to_cwl
            import json

            with open(export_cwl, "w") as cwl:
                json.dump(dag_to_cwl(dag), cwl, indent=4)
            return True
        elif report:
            from snakemake.report import auto_report

            auto_report(dag, report, stylesheet=report_stylesheet)
            return True
        elif printd3dag:
            dag.d3dag()
            return True
        elif printdag:
            print(dag)
            return True
        elif printrulegraph:
            print(dag.rule_dot())
            return True
        elif printfilegraph:
            print(dag.filegraph_dot())
            return True
        elif summary:
            print("\n".join(dag.summary(detailed=False)))
            return True
        elif detailed_summary:
            print("\n".join(dag.summary(detailed=True)))
            return True
        elif archive:
            dag.archive(archive)
            return True
        elif delete_all_output:
            dag.clean(only_temp=False, dryrun=dryrun)
            return True
        elif delete_temp_output:
            dag.clean(only_temp=True, dryrun=dryrun)
            return True
        elif list_version_changes:
            items = list(
                chain(*map(self.persistence.version_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_code_changes:
            items = list(chain(*map(self.persistence.code_changed, dag.jobs)))
            for j in dag.jobs:
                items.extend(list(j.outputs_older_than_script_or_notebook()))
            if items:
                print(*items, sep="\n")
            return True
        elif list_input_changes:
            items = list(chain(*map(self.persistence.input_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_params_changes:
            items = list(
                chain(*map(self.persistence.params_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_untracked:
            dag.list_untracked()
            return True

        if self.use_singularity:
            if assume_shared_fs:
                dag.pull_container_imgs(dryrun=dryrun or list_conda_envs,
                                        quiet=list_conda_envs)
        if self.use_conda:
            if assume_shared_fs:
                dag.create_conda_envs(
                    dryrun=dryrun or list_conda_envs or conda_cleanup_envs,
                    quiet=list_conda_envs,
                )
            if conda_create_envs_only:
                return True

        if list_conda_envs:
            print("environment", "container", "location", sep="\t")
            for env in set(job.conda_env for job in dag.jobs):
                if env:
                    print(
                        simplify_path(env.file),
                        env.container_img_url or "",
                        simplify_path(env.path),
                        sep="\t",
                    )
            return True

        if conda_cleanup_envs:
            self.persistence.conda_cleanup_envs()
            return True

        scheduler = JobScheduler(
            self,
            dag,
            self.cores,
            local_cores=local_cores,
            dryrun=dryrun,
            touch=touch,
            cluster=cluster,
            cluster_status=cluster_status,
            cluster_config=cluster_config,
            cluster_sync=cluster_sync,
            jobname=jobname,
            max_jobs_per_second=max_jobs_per_second,
            max_status_checks_per_second=max_status_checks_per_second,
            quiet=quiet,
            keepgoing=keepgoing,
            drmaa=drmaa,
            drmaa_log_dir=drmaa_log_dir,
            kubernetes=kubernetes,
            tibanna=tibanna,
            tibanna_sfn=tibanna_sfn,
            precommand=precommand,
            tibanna_config=tibanna_config,
            container_image=container_image,
            printreason=printreason,
            printshellcmds=printshellcmds,
            latency_wait=latency_wait,
            greediness=greediness,
            force_use_threads=force_use_threads,
            assume_shared_fs=assume_shared_fs,
            keepincomplete=keepincomplete,
        )

        if not dryrun:
            if len(dag):
                shell_exec = shell.get_executable()
                if shell_exec is not None:
                    logger.info("Using shell: {}".format(shell_exec))
                if cluster or cluster_sync or drmaa:
                    logger.resources_info("Provided cluster nodes: {}".format(
                        self.nodes))
                else:
                    warning = ("" if self.cores > 1 else
                               " (use --cores to define parallelism)")
                    logger.resources_info("Provided cores: {}{}".format(
                        self.cores, warning))
                    logger.resources_info("Rules claiming more threads "
                                          "will be scaled down.")

                provided_resources = format_resources(self.global_resources)
                if provided_resources:
                    logger.resources_info("Provided resources: " +
                                          provided_resources)

                if self.run_local and any(rule.group for rule in self.rules):
                    logger.info("Group jobs: inactive (local execution)")

                if not self.use_conda and any(rule.conda_env
                                              for rule in self.rules):
                    logger.info("Conda environments: ignored")

                if not self.use_singularity and any(rule.container_img
                                                    for rule in self.rules):
                    logger.info("Singularity containers: ignored")

                logger.run_info("\n".join(dag.stats()))
            else:
                logger.info("Nothing to be done.")
        else:
            # the dryrun case
            if len(dag):
                logger.run_info("\n".join(dag.stats()))
            else:
                logger.info("Nothing to be done.")
                return True
            if quiet:
                # in case of dryrun and quiet, just print above info and exit
                return True

        if not dryrun and not no_hooks:
            self._onstart(logger.get_logfile())

        success = scheduler.schedule()

        if success:
            if dryrun:
                if len(dag):
                    logger.run_info("\n".join(dag.stats()))
                logger.info("This was a dry-run (flag -n). The order of jobs "
                            "does not reflect the order of execution.")
                logger.remove_logfile()
            else:
                if stats:
                    scheduler.stats.to_json(stats)
                logger.logfile_hint()
            if not dryrun and not no_hooks:
                self._onsuccess(logger.get_logfile())
            return True
        else:
            if not dryrun and not no_hooks:
                self._onerror(logger.get_logfile())
            logger.logfile_hint()
            return False
Example #25
0
    def schedule(self):
        """ Schedule jobs that are ready, maximizing cpu usage. """

        try:
            while True:
                # workaround so that the wait does not prevent keyboard interrupts
                # while not self._open_jobs.acquire(False):
                #    time.sleep(1)
                self._open_jobs.acquire()

                # obtain needrun and running jobs in a thread-safe way
                with self._lock:
                    needrun = list(self.open_jobs)
                    running = list(self.running)
                    errors = self._errors
                    user_kill = self._user_kill

                # handle errors
                if user_kill or (not self.keepgoing and errors):
                    if user_kill == "graceful":
                        logger.info("Will exit after finishing "
                                    "currently running jobs.")

                    if not running:
                        logger.info(
                            "Shutting down, this might take some time.")
                        self._executor.shutdown()
                        if not user_kill:
                            logger.error(_ERROR_MSG_FINAL)
                        return False
                    continue

                # normal shutdown because all jobs have been finished
                if not needrun and (not running
                                    or self.workflow.immediate_submit):
                    self._executor.shutdown()
                    if errors:
                        logger.error(_ERROR_MSG_FINAL)
                    return not errors

                # continue if no new job needs to be executed
                if not needrun:
                    continue

                # select jobs by solving knapsack problem (omit with dryrun)
                if self.dryrun:
                    run = needrun
                else:
                    logger.debug("Resources before job selection: {}".format(
                        self.resources))
                    logger.debug("Ready jobs ({}):\n\t".format(len(needrun)) +
                                 "\n\t".join(map(str, needrun)))
                    run = self.job_selector(needrun)
                    logger.debug("Selected jobs ({}):\n\t".format(len(run)) +
                                 "\n\t".join(map(str, run)))
                    logger.debug("Resources after job selection: {}".format(
                        self.resources))
                # update running jobs
                with self._lock:
                    self.running.update(run)
                # actually run jobs
                for job in run:
                    with self.rate_limiter:
                        self.run(job)
        except (KeyboardInterrupt, SystemExit):
            logger.info(
                "Terminating processes on user request, this might take some time."
            )
            self._executor.cancel()
            return False
Example #26
0
    def execute(self,
                targets=None,
                dryrun=False,
                touch=False,
                cores=1,
                nodes=1,
                local_cores=1,
                forcetargets=False,
                forceall=False,
                forcerun=None,
                prioritytargets=None,
                quiet=False,
                keepgoing=False,
                printshellcmds=False,
                printreason=False,
                printdag=False,
                cluster=None,
                cluster_config=None,
                cluster_sync=None,
                jobname=None,
                immediate_submit=False,
                ignore_ambiguity=False,
                printrulegraph=False,
                printd3dag=False,
                drmaa=None,
                stats=None,
                force_incomplete=False,
                ignore_incomplete=False,
                list_version_changes=False,
                list_code_changes=False,
                list_input_changes=False,
                list_params_changes=False,
                summary=False,
                detailed_summary=False,
                latency_wait=3,
                benchmark_repeats=3,
                wait_for_files=None,
                nolock=False,
                unlock=False,
                resources=None,
                notemp=False,
                nodeps=False,
                cleanup_metadata=None,
                subsnakemake=None,
                updated_files=None,
                keep_target_files=False,
                allowed_rules=None,
                greediness=1.0,
                no_hooks=False):

        self.global_resources = dict() if resources is None else resources
        self.global_resources["_cores"] = cores
        self.global_resources["_nodes"] = nodes

        def rules(items):
            return map(self._rules.__getitem__, filter(self.is_rule, items))

        if keep_target_files:

            def files(items):
                return filterfalse(self.is_rule, items)
        else:

            def files(items):
                return map(os.path.relpath, filterfalse(self.is_rule, items))

        if not targets:
            targets = [self.first_rule
                       ] if self.first_rule is not None else list()
        if prioritytargets is None:
            prioritytargets = list()
        if forcerun is None:
            forcerun = list()

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        targetrules = set(chain(rules(targets),
                                filterfalse(Rule.has_wildcards, priorityrules),
                                filterfalse(Rule.has_wildcards, forcerules)))
        targetfiles = set(chain(files(targets), priorityfiles, forcefiles))
        if forcetargets:
            forcefiles.update(targetfiles)
            forcerules.update(targetrules)

        rules = self.rules
        if allowed_rules:
            rules = [rule for rule in rules if rule.name in set(allowed_rules)]

        if wait_for_files is not None:
            try:
                snakemake.io.wait_for_files(wait_for_files,
                                            latency_wait=latency_wait)
            except IOError as e:
                logger.error(str(e))
                return False

        dag = DAG(
            self, rules,
            dryrun=dryrun,
            targetfiles=targetfiles,
            targetrules=targetrules,
            forceall=forceall,
            forcefiles=forcefiles,
            forcerules=forcerules,
            priorityfiles=priorityfiles,
            priorityrules=priorityrules,
            ignore_ambiguity=ignore_ambiguity,
            force_incomplete=force_incomplete,
            ignore_incomplete=ignore_incomplete or printdag or printrulegraph,
            notemp=notemp)

        self.persistence = Persistence(
            nolock=nolock,
            dag=dag,
            warn_only=dryrun or printrulegraph or printdag or summary or
            list_version_changes or list_code_changes or list_input_changes or
            list_params_changes)

        if cleanup_metadata:
            for f in cleanup_metadata:
                self.persistence.cleanup_metadata(f)
            return True

        dag.init()
        dag.check_dynamic()

        if unlock:
            try:
                self.persistence.cleanup_locks()
                logger.info("Unlocking working directory.")
                return True
            except IOError:
                logger.error("Error: Unlocking the directory {} failed. Maybe "
                             "you don't have the permissions?")
                return False
        try:
            self.persistence.lock()
        except IOError:
            logger.error(
                "Error: Directory cannot be locked. Please make "
                "sure that no other Snakemake process is trying to create "
                "the same files in the following directory:\n{}\n"
                "If you are sure that no other "
                "instances of snakemake are running on this directory, "
                "the remaining lock was likely caused by a kill signal or "
                "a power loss. It can be removed with "
                "the --unlock argument.".format(os.getcwd()))
            return False

        if self.subworkflows and not printdag and not printrulegraph:
            # backup globals
            globals_backup = dict(self.globals)
            # execute subworkflows
            for subworkflow in self.subworkflows:
                subworkflow_targets = subworkflow.targets(dag)
                updated = list()
                if subworkflow_targets:
                    logger.info(
                        "Executing subworkflow {}.".format(subworkflow.name))
                    if not subsnakemake(subworkflow.snakefile,
                                        workdir=subworkflow.workdir,
                                        targets=subworkflow_targets,
                                        updated_files=updated):
                        return False
                    dag.updated_subworkflow_files.update(subworkflow.target(f)
                                                         for f in updated)
                else:
                    logger.info("Subworkflow {}: Nothing to be done.".format(
                        subworkflow.name))
            if self.subworkflows:
                logger.info("Executing main workflow.")
            # rescue globals
            self.globals.update(globals_backup)

        dag.check_incomplete()
        dag.postprocess()

        if nodeps:
            missing_input = [f for job in dag.targetjobs for f in job.input
                             if dag.needrun(job) and not os.path.exists(f)]
            if missing_input:
                logger.error(
                    "Dependency resolution disabled (--nodeps) "
                    "but missing input "
                    "files detected. If this happens on a cluster, please make sure "
                    "that you handle the dependencies yourself or turn of "
                    "--immediate-submit. Missing input files:\n{}".format(
                        "\n".join(missing_input)))
                return False

        updated_files.extend(f for job in dag.needrun_jobs for f in job.output)

        if printd3dag:
            dag.d3dag()
            return True
        elif printdag:
            print(dag)
            return True
        elif printrulegraph:
            print(dag.rule_dot())
            return True
        elif summary:
            print("\n".join(dag.summary(detailed=False)))
            return True
        elif detailed_summary:
            print("\n".join(dag.summary(detailed=True)))
            return True
        elif list_version_changes:
            items = list(
                chain(*map(self.persistence.version_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_code_changes:
            items = list(chain(*map(self.persistence.code_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_input_changes:
            items = list(chain(*map(self.persistence.input_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_params_changes:
            items = list(
                chain(*map(self.persistence.params_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True

        scheduler = JobScheduler(self, dag, cores,
                                 local_cores=local_cores,
                                 dryrun=dryrun,
                                 touch=touch,
                                 cluster=cluster,
                                 cluster_config=cluster_config,
                                 cluster_sync=cluster_sync,
                                 jobname=jobname,
                                 immediate_submit=immediate_submit,
                                 quiet=quiet,
                                 keepgoing=keepgoing,
                                 drmaa=drmaa,
                                 printreason=printreason,
                                 printshellcmds=printshellcmds,
                                 latency_wait=latency_wait,
                                 benchmark_repeats=benchmark_repeats,
                                 greediness=greediness)

        if not dryrun and not quiet:
            if len(dag):
                if cluster or cluster_sync or drmaa:
                    logger.resources_info(
                        "Provided cluster nodes: {}".format(nodes))
                else:
                    logger.resources_info("Provided cores: {}".format(cores))
                    logger.resources_info("Rules claiming more threads will be scaled down.")
                provided_resources = format_resources(resources)
                if provided_resources:
                    logger.resources_info(
                        "Provided resources: " + provided_resources)
                ignored_resources = format_resource_names(
                    set(resource for job in dag.needrun_jobs for resource in
                        job.resources_dict if resource not in resources))
                if ignored_resources:
                    logger.resources_info(
                        "Ignored resources: " + ignored_resources)
                logger.run_info("\n".join(dag.stats()))
            else:
                logger.info("Nothing to be done.")
        if dryrun and not len(dag):
            logger.info("Nothing to be done.")

        success = scheduler.schedule()

        if success:
            if dryrun:
                if not quiet and len(dag):
                    logger.run_info("\n".join(dag.stats()))
            elif stats:
                scheduler.stats.to_json(stats)
            if not dryrun and not no_hooks:
                self._onsuccess(logger.get_logfile())
            return True
        else:
            if not dryrun and not no_hooks:
                self._onerror(logger.get_logfile())
            return False
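# The execute() method above calls format_resources() and format_resource_names(),
# which are not included in this example. The helpers below are only a minimal
# sketch of what they might look like; the exact signatures and output format
# used by Snakemake may differ.


def format_resources(resources):
    """Render a resource dict such as {"mem_mb": 1000, "io": 5} as 'mem_mb=1000, io=5'."""
    return ", ".join("{}={}".format(name, value) for name, value in resources.items())


def format_resource_names(resource_names):
    """Render a set of resource names as a comma-separated string."""
    return ", ".join(sorted(resource_names))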
Example #27
0
def workflow_parser(args_dict):
    """"""
    # Unlock locked dir and exit
    if args_dict.get("unlock", False):
        logger.warning("Unlocking working directory")
        unlock_dir(workdir=args_dict["workdir"])
        sys.exit()

    # Generate templates and exit
    if args_dict.get("generate_template", False):
        logger.warning("Generate template files in working directory")
        generate_template(workflow_dir=WORKFLOW_DIR,
                          templates=args_dict["generate_template"],
                          workflow=args_dict["subcommand"],
                          workdir=args_dict["workdir"],
                          overwrite=args_dict["overwrite_template"],
                          verbose=args_dict["verbose"],
                          quiet=args_dict["quiet"])
        sys.exit()

    # Cluster stuff to simplify options
    if args_dict["cluster_config"]:
        logger.warning("INITIALISING WORKFLOW IN CLUSTER MODE")
        cluster_config_fn = args_dict["cluster_config"]
        args_dict["local_cores"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                                val_name="cluster_cores",
                                                default=10000)
        args_dict["nodes"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                          val_name="cluster_nodes",
                                          default=500)
        args_dict["cluster"] = get_yaml_val(yaml_fn=cluster_config_fn,
                                            val_name="cluster_cmd")
        args_dict["config"] = args_dict["cluster_config"]
        logger.debug("Cores:{} / Nodes:{} / Cluster_cmd:{}".format(
            args_dict['local_cores'], args_dict['nodes'],
            args_dict['cluster']))
    elif args_dict["config"]:
        logger.warning("INITIALISING WORKFLOW IN LOCAL MODE")
    else:
        logger.error(
            "A configuration file `--config` or a cluster configuration file `--cluster_config` is required"
        )
        sys.exit()

    # Get and check config files
    logger.warning("LOADING CONFIGURATIONS INFO")
    snakefile = get_snakefile_fn(workflow_dir=WORKFLOW_DIR,
                                 workflow=args_dict["subcommand"])
    configfile = get_config_fn(config=args_dict["config"])
    kwargs = filter_out_options(args_dict)
    logger.debug(kwargs)

    # Run Snakemake API
    try:
        snakemake(snakefile=snakefile,
                  configfiles=[configfile],
                  use_conda=True,
                  wrapper_prefix=WRAPPER_PREFIX,
                  **kwargs)
    except TypeError as E:
        logger.error("Unsupported Option Error. {}".format(E))
        sys.exit()
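# workflow_parser() above relies on a get_yaml_val() helper that reads a single
# value from a YAML file with a fallback default. Its real implementation is not
# shown in this document; the version below is only a sketch of that idea and
# assumes PyYAML is available.

import yaml


def get_yaml_val(yaml_fn, val_name, default=None):
    """Return the value stored under `val_name` in a YAML file, or `default` if absent."""
    with open(yaml_fn) as fh:
        data = yaml.safe_load(fh) or {}
    return data.get(val_name, default)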
Example #28
0
def snakemake(snakefile,
              listrules=False,
              list_target_rules=False,
              cores=1,
              nodes=1,
              local_cores=1,
              resources=dict(),
              config=dict(),
              configfile=None,
              config_args=None,
              workdir=None,
              targets=None,
              dryrun=False,
              touch=False,
              forcetargets=False,
              forceall=False,
              forcerun=[],
              prioritytargets=[],
              stats=None,
              printreason=False,
              printshellcmds=False,
              printdag=False,
              printrulegraph=False,
              printd3dag=False,
              nocolor=False,
              quiet=False,
              keepgoing=False,
              cluster=None,
              cluster_config=None,
              cluster_sync=None,
              drmaa=None,
              jobname="snakejob.{rulename}.{jobid}.sh",
              immediate_submit=False,
              standalone=False,
              ignore_ambiguity=False,
              snakemakepath=None,
              lock=True,
              unlock=False,
              cleanup_metadata=None,
              force_incomplete=False,
              ignore_incomplete=False,
              list_version_changes=False,
              list_code_changes=False,
              list_input_changes=False,
              list_params_changes=False,
              list_resources=False,
              summary=False,
              detailed_summary=False,
              latency_wait=3,
              benchmark_repeats=1,
              wait_for_files=None,
              print_compilation=False,
              debug=False,
              notemp=False,
              nodeps=False,
              keep_target_files=False,
              allowed_rules=None,
              jobscript=None,
              timestamp=False,
              greediness=None,
              no_hooks=False,
              overwrite_shellcmd=None,
              updated_files=None,
              log_handler=None,
              keep_logger=False,
              verbose=False):
    """Run snakemake on a given snakefile.

    This function provides access to the whole snakemake functionality. It is not thread-safe.

    Args:
        snakefile (str):            the path to the snakefile
        listrules (bool):           list rules (default False)
        list_target_rules (bool):   list target rules (default False)
        cores (int):                the number of provided cores (ignored when using cluster support) (default 1)
        nodes (int):                the number of provided cluster nodes (ignored without cluster support) (default 1)
        local_cores (int):          the number of provided local cores if in cluster mode (ignored without cluster support) (default 1)
        resources (dict):           provided resources, a dictionary assigning integers to resource names, e.g. {gpu=1, io=5} (default {})
        config (dict):              override values for workflow config
        workdir (str):              path to working directory (default None)
        targets (list):             list of targets, e.g. rule or file names (default None)
        dryrun (bool):              only dry-run the workflow (default False)
        touch (bool):               only touch all output files if present (default False)
        forcetargets (bool):        force given targets to be re-created (default False)
        forceall (bool):            force all output files to be re-created (default False)
        forcerun (list):             list of files and rules that shall be re-created/re-executed (default [])
        prioritytargets (list):     list of targets that shall be run with maximum priority (default [])
        stats (str):                path to file that shall contain stats about the workflow execution (default None)
        printreason (bool):         print the reason for the execution of each job (default False)
        printshellcmds (bool):      print the shell command of each job (default False)
        printdag (bool):            print the dag in the graphviz dot language (default False)
        printrulegraph (bool):      print the graph of rules in the graphviz dot language (default False)
        printd3dag (bool):          print a D3.js compatible JSON representation of the DAG (default False)
        nocolor (bool):             do not print colored output (default False)
        quiet (bool):               do not print any default job information (default False)
        keepgoing (bool):           keep going upon errors (default False)
        cluster (str):              submission command of a cluster or batch system to use, e.g. qsub (default None)
        cluster_config (str):       configuration file for cluster options (default None)
        cluster_sync (str):         blocking cluster submission command (like SGE 'qsub -sync y')  (default None)
        drmaa (str):                if not None use DRMAA for cluster support, str specifies native args passed to the cluster when submitting a job
        jobname (str):              naming scheme for cluster job scripts (default "snakejob.{rulename}.{jobid}.sh")
        immediate_submit (bool):    immediately submit all cluster jobs, regardless of dependencies (default False)
        standalone (bool):          kill all processes very rudely in case of failure (do not use this if you use this API) (default False)
        ignore_ambiguity (bool):    ignore ambiguous rules and always take the first possible one (default False)
        snakemakepath (str):        path to the snakemake executable (default None)
        lock (bool):                lock the working directory when executing the workflow (default True)
        unlock (bool):              just unlock the working directory (default False)
        cleanup_metadata (list):    just cleanup metadata of the given list of output files (default None)
        force_incomplete (bool):    force the re-creation of incomplete files (default False)
        ignore_incomplete (bool):   ignore incomplete files (default False)
        list_version_changes (bool): list output files with changed rule version (default False)
        list_code_changes (bool):   list output files with changed rule code (default False)
        list_input_changes (bool):  list output files with changed input files (default False)
        list_params_changes (bool): list output files with changed params (default False)
        summary (bool):             list summary of all output files and their status (default False)
        detailed_summary (bool):    list summary of all output files and their status, including extra info about input files and shell commands (default False)
        latency_wait (int):         how many seconds to wait for an output file to appear after the execution of a job, e.g. to handle filesystem latency (default 3)
        benchmark_repeats (int):    number of repeated runs of a job if declared for benchmarking (default 1)
        wait_for_files (list):      wait for given files to be present before executing the workflow
        list_resources (bool):      list resources used in the workflow (default False)
        print_compilation (bool):   print the compilation of the snakefile (default False)
        debug (bool):               allow using the debugger within rules (default False)
        notemp (bool):              ignore temp file flags, e.g. do not delete output files marked as temp after use (default False)
        nodeps (bool):              ignore dependencies (default False)
        keep_target_files (bool):   Do not adjust the paths of given target files relative to the working directory.
        allowed_rules (set):        Restrict allowed rules to the given set. If None or empty, all rules are used.
        jobscript (str):            path to a custom shell script template for cluster jobs (default None)
        timestamp (bool):           print time stamps in front of any output (default False)
        greediness (float):         set the greediness of scheduling. This value between 0 and 1 determines how carefully jobs are selected for execution. The default value (0.5 if prioritytargets are used, 1.0 else) provides the best speed and still acceptable scheduling quality.
        overwrite_shellcmd (str):   a shell command that shall be executed instead of those given in the workflow. This is for debugging purposes only.
        updated_files(list):        a list that will be filled with the files that are updated or created during the workflow execution
        verbose(bool):              show additional debug output (default False)
        log_handler (function):     redirect snakemake output to this custom log handler, a function that takes a log message dictionary (see below) as its only argument (default None). The log message dictionary for the log handler has the following entries:

            :level:
                the log level ("info", "error", "debug", "progress", "job_info")

            :level="info", "error" or "debug":
                :msg:
                    the log message
            :level="progress":
                :done:
                    number of already executed jobs

                :total:
                    number of total jobs

            :level="job_info":
                :input:
                    list of input files of a job

                :output:
                    list of output files of a job

                :log:
                    path to log file of a job

                :local:
                    whether a job is executed locally (i.e. ignoring cluster)

                :msg:
                    the job message

                :reason:
                    the job reason

                :priority:
                    the job priority

                :threads:
                    the threads of the job


    Returns:
        bool:   True if workflow execution was successful.

    """

    if updated_files is None:
        updated_files = list()

    if cluster or cluster_sync or drmaa:
        cores = sys.maxsize
    else:
        nodes = sys.maxsize

    if cluster_config:
        cluster_config = load_configfile(cluster_config)
    else:
        cluster_config = dict()

    if not keep_logger:
        setup_logger(handler=log_handler,
                     quiet=quiet,
                     printreason=printreason,
                     printshellcmds=printshellcmds,
                     nocolor=nocolor,
                     stdout=dryrun,
                     debug=verbose,
                     timestamp=timestamp)

    if greediness is None:
        greediness = 0.5 if prioritytargets else 1.0
    else:
        if not (0 <= greediness <= 1.0):
            logger.error("Error: greediness must be a float between 0 and 1.")
            return False

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False
    snakefile = os.path.abspath(snakefile)

    cluster_mode = ((cluster is not None) + (cluster_sync is not None)
                    + (drmaa is not None))
    if cluster_mode > 1:
        logger.error("Error: cluster and drmaa args are mutually exclusive")
        return False
    if debug and (cores > 1 or cluster_mode):
        logger.error(
            "Error: debug mode cannot be used with more than one core or cluster execution.")
        return False

    overwrite_config = dict()
    if configfile:
        overwrite_config.update(load_configfile(configfile))
    if config:
        overwrite_config.update(config)

    if workdir:
        olddir = os.getcwd()
        if not os.path.exists(workdir):
            logger.info(
                "Creating specified working directory {}.".format(workdir))
            os.makedirs(workdir)
        workdir = os.path.abspath(workdir)
        os.chdir(workdir)
    workflow = Workflow(snakefile=snakefile,
                        snakemakepath=snakemakepath,
                        jobscript=jobscript,
                        overwrite_shellcmd=overwrite_shellcmd,
                        overwrite_config=overwrite_config,
                        overwrite_workdir=workdir,
                        overwrite_configfile=configfile,
                        config_args=config_args,
                        debug=debug)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile,
                         overwrite_first_rule=True,
                         print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            elif list_target_rules:
                workflow.list_rules(only_targets=True)
            elif list_resources:
                workflow.list_resources()
            else:
                # if not printdag and not printrulegraph:
                # handle subworkflows
                subsnakemake = partial(snakemake,
                                       cores=cores,
                                       nodes=nodes,
                                       local_cores=local_cores,
                                       resources=resources,
                                       dryrun=dryrun,
                                       touch=touch,
                                       printreason=printreason,
                                       printshellcmds=printshellcmds,
                                       nocolor=nocolor,
                                       quiet=quiet,
                                       keepgoing=keepgoing,
                                       cluster=cluster,
                                       cluster_config=cluster_config,
                                       cluster_sync=cluster_sync,
                                       drmaa=drmaa,
                                       jobname=jobname,
                                       immediate_submit=immediate_submit,
                                       standalone=standalone,
                                       ignore_ambiguity=ignore_ambiguity,
                                       snakemakepath=snakemakepath,
                                       lock=lock,
                                       unlock=unlock,
                                       cleanup_metadata=cleanup_metadata,
                                       force_incomplete=force_incomplete,
                                       ignore_incomplete=ignore_incomplete,
                                       latency_wait=latency_wait,
                                       benchmark_repeats=benchmark_repeats,
                                       verbose=verbose,
                                       notemp=notemp,
                                       nodeps=nodeps,
                                       jobscript=jobscript,
                                       timestamp=timestamp,
                                       greediness=greediness,
                                       no_hooks=no_hooks,
                                       overwrite_shellcmd=overwrite_shellcmd,
                                       config=config,
                                       config_args=config_args,
                                       keep_logger=True)
                success = workflow.execute(
                    targets=targets,
                    dryrun=dryrun,
                    touch=touch,
                    cores=cores,
                    nodes=nodes,
                    local_cores=local_cores,
                    forcetargets=forcetargets,
                    forceall=forceall,
                    forcerun=forcerun,
                    prioritytargets=prioritytargets,
                    quiet=quiet,
                    keepgoing=keepgoing,
                    printshellcmds=printshellcmds,
                    printreason=printreason,
                    printrulegraph=printrulegraph,
                    printdag=printdag,
                    cluster=cluster,
                    cluster_config=cluster_config,
                    cluster_sync=cluster_sync,
                    jobname=jobname,
                    drmaa=drmaa,
                    printd3dag=printd3dag,
                    immediate_submit=immediate_submit,
                    ignore_ambiguity=ignore_ambiguity,
                    stats=stats,
                    force_incomplete=force_incomplete,
                    ignore_incomplete=ignore_incomplete,
                    list_version_changes=list_version_changes,
                    list_code_changes=list_code_changes,
                    list_input_changes=list_input_changes,
                    list_params_changes=list_params_changes,
                    summary=summary,
                    latency_wait=latency_wait,
                    benchmark_repeats=benchmark_repeats,
                    wait_for_files=wait_for_files,
                    detailed_summary=detailed_summary,
                    nolock=not lock,
                    unlock=unlock,
                    resources=resources,
                    notemp=notemp,
                    nodeps=nodeps,
                    keep_target_files=keep_target_files,
                    cleanup_metadata=cleanup_metadata,
                    subsnakemake=subsnakemake,
                    updated_files=updated_files,
                    allowed_rules=allowed_rules,
                    greediness=greediness,
                    no_hooks=no_hooks)

    except BrokenPipeError:
        # ignore this exception and stop. It occurs if snakemake output is piped into less and less quits before reading the whole output.
        # in such a case, snakemake shall stop scheduling and quit with error 1
        success = False
    except (Exception, BaseException) as ex:
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    if not keep_logger:
        logger.cleanup()
    return success
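# A minimal usage sketch for the snakemake() API documented above: dry-run a
# snakefile and capture log messages with a custom log_handler. The "Snakefile"
# path and the handler below are illustrative assumptions, not part of the
# original code.

def my_log_handler(msg):
    # `msg` is the log message dictionary described in the docstring above.
    if msg["level"] in ("info", "error"):
        print("[{}] {}".format(msg["level"], msg.get("msg", "")))
    elif msg["level"] == "progress":
        print("{}/{} jobs done".format(msg["done"], msg["total"]))


success = snakemake("Snakefile",
                    cores=4,
                    dryrun=True,
                    printshellcmds=True,
                    log_handler=my_log_handler)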
Example #29
0
    def schedule(self):
        """Schedule jobs that are ready, maximizing cpu usage."""
        try:
            while True:
                # work around so that the wait does not prevent keyboard interrupts
                # while not self._open_jobs.acquire(False):
                #    time.sleep(1)
                self._open_jobs.acquire()

                # obtain needrun and running jobs in a thread-safe way
                with self._lock:
                    self._finish_jobs()
                    self._error_jobs()
                    needrun = set(self.open_jobs)
                    running = list(self.running)
                    errors = self._errors
                    executor_error = self._executor_error
                    user_kill = self._user_kill

                # handle errors
                if user_kill or (not self.keepgoing and errors) or executor_error:
                    if user_kill == "graceful":
                        logger.info(
                            "Will exit after finishing currently running jobs (scheduler)."
                        )

                    if executor_error:
                        print_exception(executor_error, self.workflow.linemaps)

                    if executor_error or not running:
                        logger.info("Shutting down, this might take some time.")
                        self._executor.shutdown()
                        if not user_kill:
                            logger.error(_ERROR_MSG_FINAL)
                        return False
                    continue

                # all runnable jobs have finished, normal shutdown
                if not needrun and (not running or self.workflow.immediate_submit):
                    self._executor.shutdown()
                    if errors:
                        logger.error(_ERROR_MSG_FINAL)
                    # we still have unfinished jobs. this is not good. direct
                    # user to github issue
                    if self.remaining_jobs and not self.keepgoing:
                        logger.error(_ERROR_MSG_ISSUE_823)
                        logger.error(
                            "Remaining jobs:\n"
                            + "\n".join(
                                " - " + str(job) + ": " + ", ".join(job.output)
                                for job in self.remaining_jobs
                            )
                        )
                        return False
                    return not errors

                # continue if no new job needs to be executed
                if not needrun:
                    continue

                # select jobs by solving knapsack problem (omit with dryrun)
                if self.dryrun:
                    run = needrun
                else:
                    # Reset params and resources because they might still contain TBDs
                    # or old values from before files have been regenerated.
                    # Now, they can be recalculated as all input is present and up to date.
                    for job in needrun:
                        job.reset_params_and_resources()

                    logger.debug(
                        "Resources before job selection: {}".format(self.resources)
                    )
                    logger.debug(
                        "Ready jobs ({}):\n\t".format(len(needrun))
                        + "\n\t".join(map(str, needrun))
                    )

                    if not self._last_job_selection_empty:
                        logger.info("Select jobs to execute...")
                    run = self.job_selector(needrun)
                    self._last_job_selection_empty = not run

                    logger.debug(
                        "Selected jobs ({}):\n\t".format(len(run))
                        + "\n\t".join(map(str, run))
                    )
                    logger.debug(
                        "Resources after job selection: {}".format(self.resources)
                    )
                # update running jobs
                with self._lock:
                    self.running.update(run)
                    # remove from ready_jobs
                    self.dag.register_running(run)

                # actually run jobs
                local_runjobs = [job for job in run if job.is_local]
                runjobs = [job for job in run if not job.is_local]
                self.run(local_runjobs, executor=self._local_executor or self._executor)
                self.run(runjobs)
        except (KeyboardInterrupt, SystemExit):
            logger.info(
                "Terminating processes on user request, this might take some time."
            )
            self._executor.cancel()
            return False
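# schedule() above delegates job selection to self.job_selector(), which picks a
# subset of ready jobs that fits into the currently free resources (described in
# the comments as a knapsack problem). The greedy selector below is only a sketch
# of that idea, assuming job objects with the `priority` and `resources_dict`
# attributes used elsewhere in this document; Snakemake's actual selector is more
# involved.

def greedy_job_selector(ready_jobs, free_resources):
    """Pick jobs in priority order as long as their resource demands still fit."""
    selected = []
    remaining = dict(free_resources)
    for job in sorted(ready_jobs, key=lambda j: j.priority, reverse=True):
        demand = job.resources_dict
        if all(demand.get(name, 0) <= remaining.get(name, 0) for name in demand):
            selected.append(job)
            for name, amount in demand.items():
                remaining[name] = remaining.get(name, 0) - amount
    return selected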
Example #30
0
def samples2metadata_sra(samples: List[str], logger) -> dict:
    """
    Get the required info to continue a seq2science run from a list of samples.

    - If a sample already exists locally, we only want to know if it is paired-end or single-end.
    - If a sample does not exist locally
      - find its corresponding SRX number and all runs that belong to it,
      - check if they all have the same layout, if not, crash
      - see if we can download the runs from ena

    output:
        {
            "GSM1234": {"layout": "PAIRED",
                        "runs": ["SRR1234", "SRR4321"],
                        "ena_fastq_ftp": {...}},

            "SRR5678": {"layout": "SINGLE",
                        "runs": ["SRR5678"],
                        "ena_fastq_ftp": None},
            ...
        }
    """
    # start with empty dictionary which we fill out later
    SAMPLEDICT = {sample: dict() for sample in samples}

    # only continue with public samples
    db_sra = pysradb.SRAweb()

    # all samples that are on GEO (GSM numbers), must first be converted to SRA ids (SRX numbers)
    geo_samples = [sample for sample in samples if sample.startswith("GSM")]

    # in sample2clean we store the (potential GEO) sample name in a SRA compliant name
    if len(geo_samples):
        try:
            df_geo = db_sra.gsm_to_srx(geo_samples)
        except:
            logger.error(
                "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
                "are overloaded or slow. Please try again in a bit...\n"
                "Another possible option is that you try to access samples that do not exist or are protected, and "
                "seq2science does not support downloading those..\n\n")
            os._exit(1)  # noqa

        sample2clean = dict(
            zip(df_geo.experiment_alias, df_geo.experiment_accession))
    else:
        sample2clean = dict()

    # now add the already SRA compliant names with a reference to itself
    sample2clean.update(
        {sample: sample
         for sample in samples if sample not in geo_samples})

    # check our samples on sra
    try:
        df_sra = db_sra.sra_metadata(list(sample2clean.values()),
                                     detailed=True)
    except:
        logger.error(
            "We had trouble querying the SRA. This probably means that the SRA was unresponsive, and their servers "
            "are overloaded or slow. Please try again in a bit...\n"
            "Another possible option is that you try to access samples that do not exist or are protected, and "
            "seq2science does not support downloading those..\n\n")
        os._exit(1)  # noqa

    # keep track of not-supported samples
    not_supported_formats = ["ABI_SOLID"]
    not_supported_samples = []

    for sample, clean in sample2clean.items():
        # table indices
        idxs = _sample_to_idxs(df_sra, clean)

        # get all runs that belong to the sample
        runs = df_sra.loc[idxs].run_accession.tolist()
        assert len(runs) >= 1
        SAMPLEDICT[sample]["runs"] = runs

        # check if sample is from a supported format
        for bad_format in not_supported_formats:
            for real_format in df_sra.loc[idxs].instrument_model_desc.tolist():
                if real_format == bad_format:
                    not_supported_samples.append(sample)

        # get the layout
        layout = df_sra.loc[idxs].library_layout.tolist()
        assert len(set(
            layout)) == 1, f"sample {sample} consists of mixed layouts, bad!"
        assert layout[0] in ["PAIRED", "SINGLE"
                             ], f"sample {sample} is an unclear layout, bad!"
        SAMPLEDICT[sample]["layout"] = layout[0]

        # get the ena url
        SAMPLEDICT[sample]["ena_fastq_ftp"] = dict()
        for run in runs:
            if layout[0] == "SINGLE":
                SAMPLEDICT[sample]["ena_fastq_ftp"][run] = df_sra[
                    df_sra.run_accession == run].ena_fastq_ftp.tolist()
            elif layout[0] == "PAIRED":
                SAMPLEDICT[sample]["ena_fastq_ftp"][run] = (
                    df_sra[df_sra.run_accession ==
                           run].ena_fastq_ftp_1.tolist() +
                    df_sra[df_sra.run_accession ==
                           run].ena_fastq_ftp_2.tolist())

        # if any run from a sample is not found on ENA, better be safe, and assume that sample as a whole is not on ENA
        if any([
                any(pd.isna(urls))
                for urls in SAMPLEDICT[sample]["ena_fastq_ftp"].values()
        ]):
            SAMPLEDICT[sample]["ena_fastq_ftp"] = None

    # now report a single message for all sample(s) that are from a sequencing platform that is not supported
    assert len(not_supported_samples) == 0, (
        f'Sample(s) {", ".join(not_supported_samples)} are not supported by seq2science. Samples that are one of '
        f'these formats; [{", ".join(not_supported_formats)}] are not supported.'
    )

    return SAMPLEDICT
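# Usage sketch for samples2metadata_sra() above, reusing the placeholder
# accessions from its docstring. This queries the SRA/ENA web services, so it is
# shown only to illustrate the returned structure.

import logging

logger = logging.getLogger("seq2science")
metadata = samples2metadata_sra(["GSM1234", "SRR5678"], logger)
for sample, info in metadata.items():
    print(sample, info["layout"], info["runs"], info["ena_fastq_ftp"] is not None)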
Example #31
0
def snakemake(snakefile,
    listrules=False,
    cores=1,
    resources=None,
    workdir=None,
    targets=None,
    dryrun=False,
    touch=False,
    forcetargets=False,
    forceall=False,
    forcerun=None,
    prioritytargets=None,
    stats=None,
    printreason=False,
    printshellcmds=False,
    printdag=False,
    printrulegraph=False,
    nocolor=False,
    quiet=False,
    keepgoing=False,
    cluster=None,
    immediate_submit=False,
    standalone=False,
    ignore_ambiguity=False,
    snakemakepath=None,
    lock=True,
    unlock=False,
    cleanup_metadata=None,
    force_incomplete=False,
    ignore_incomplete=False,
    list_version_changes=False,
    list_code_changes=False,
    list_input_changes=False,
    list_params_changes=False,
    summary=False,
    output_wait=3,
    print_compilation=False,
    debug=False,
    notemp=False,
    nodeps=False,
    jobscript=None,
    timestamp=False):
    """
    Run snakemake on a given snakefile.
    Note: at the moment, this function is not thread-safe!

    Arguments
    snakefile         -- the snakefile.
    list              -- list rules.
    jobs              -- maximum number of parallel jobs (default: 1).
    directory         -- working directory (default: current directory).
    rule              -- execute this rule (default: first rule in snakefile).
    dryrun            -- print the rules that would be executed,
        but do not execute them.
    forcethis         -- force the selected rule to be executed
    forceall          -- force all rules to be executed
    time_measurements -- measure the running times of all rules
    lock              -- lock the working directory
    """

    init_logger(nocolor=nocolor, stdout=dryrun, debug=debug, timestamp=timestamp)

    if not os.path.exists(snakefile):
        logger.error("Error: Snakefile \"{}\" not present.".format(snakefile))
        return False

    if workdir:
        olddir = os.getcwd()
    workflow = Workflow(
        snakefile=snakefile, snakemakepath=snakemakepath,
        jobscript=jobscript)

    if standalone:
        try:
            # set the process group
            os.setpgrp()
        except:
            # ignore: if it does not work we can still work without it
            pass

    success = True
    try:
        workflow.include(snakefile, workdir=workdir,
            overwrite_first_rule=True, print_compilation=print_compilation)
        workflow.check()

        if not print_compilation:
            if listrules:
                workflow.list_rules()
            else:
                if not printdag and not printrulegraph:
                    # handle subworkflows
                    subsnakemake = partial(
                        snakemake,
                        cores=cores,
                        resources=resources,
                        dryrun=dryrun,
                        touch=touch,
                        printreason=printreason,
                        printshellcmds=printshellcmds,
                        nocolor=nocolor,
                        quiet=quiet,
                        keepgoing=keepgoing,
                        cluster=cluster,
                        immediate_submit=immediate_submit,
                        standalone=standalone,
                        ignore_ambiguity=ignore_ambiguity,
                        snakemakepath=snakemakepath,
                        lock=lock,
                        unlock=unlock,
                        cleanup_metadata=cleanup_metadata,
                        force_incomplete=force_incomplete,
                        ignore_incomplete=ignore_incomplete,
                        output_wait=output_wait,
                        debug=debug,
                        notemp=notemp,
                        nodeps=nodeps,
                        jobscript=jobscript,
                        timestamp=timestamp)
                    for subworkflow in workflow.subworkflows:
                        logger.warning("Executing subworkflow {}.".format(subworkflow.name))
                        if not subsnakemake(subworkflow.snakefile, workdir=subworkflow.workdir, targets=subworkflow.targets):
                            success = False
                    if workflow.subworkflows:
                        logger.warning("Executing main workflow.")
                if success:
                    success = workflow.execute(
                        targets=targets, dryrun=dryrun, touch=touch,
                        cores=cores, forcetargets=forcetargets,
                        forceall=forceall, forcerun=forcerun,
                        prioritytargets=prioritytargets, quiet=quiet,
                        keepgoing=keepgoing, printshellcmds=printshellcmds,
                        printreason=printreason, printrulegraph=printrulegraph,
                        printdag=printdag, cluster=cluster,
                        immediate_submit=immediate_submit,
                        ignore_ambiguity=ignore_ambiguity,
                        workdir=workdir, stats=stats,
                        force_incomplete=force_incomplete,
                        ignore_incomplete=ignore_incomplete,
                        list_version_changes=list_version_changes,
                        list_code_changes=list_code_changes,
                        list_input_changes=list_input_changes,
                        list_params_changes=list_params_changes,
                        summary=summary,
                        output_wait=output_wait,
                        nolock=not lock,
                        unlock=unlock,
                        resources=resources,
                        notemp=notemp,
                        nodeps=nodeps,
                        cleanup_metadata=cleanup_metadata
                        )

    except (Exception, BaseException) as ex:
        print_exception(ex, workflow.linemaps)
        success = False
    if workdir:
        os.chdir(olddir)
    if workflow.persistence:
        workflow.persistence.unlock()
    return success
Example #32
0
def print_exception(ex, linemaps):
    """
    Print an error message for a given exception.

    Arguments
    ex -- the exception
    linemaps -- a dict of a dict that maps for each snakefile
        the compiled lines to source code lines in the snakefile.
    """
    tb = "Full " + "".join(traceback.format_exception(type(ex), ex, ex.__traceback__))
    logger.debug(tb)
    if isinstance(ex, SyntaxError) or isinstance(ex, IndentationError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=True))
        return
    origin = get_exception_origin(ex, linemaps)
    if origin is not None:
        lineno, file = origin
        logger.error(format_error(ex, lineno,
                                  linemaps=linemaps,
                                  snakefile=file,
                                  show_traceback=True))
        return
    elif isinstance(ex, TokenError):
        logger.error(format_error(ex, None, show_traceback=False))
    elif isinstance(ex, MissingRuleException):
        logger.error(format_error(ex, None,
                                  linemaps=linemaps,
                                  snakefile=ex.filename,
                                  show_traceback=False))
    elif isinstance(ex, RuleException):
        for e in ex._include + [ex]:
            if not e.omit:
                logger.error(format_error(e, e.lineno,
                                          linemaps=linemaps,
                                          snakefile=e.filename,
                                          show_traceback=True))
    elif isinstance(ex, WorkflowError):
        logger.error(format_error(ex, ex.lineno,
                                  linemaps=linemaps,
                                  snakefile=ex.snakefile,
                                  show_traceback=True))
    elif isinstance(ex, KeyboardInterrupt):
        logger.info("Cancelling snakemake on user request.")
    else:
        traceback.print_exception(type(ex), ex, ex.__traceback__)
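# print_exception() above expects `linemaps` to be a dict of dicts: for each
# snakefile path, a mapping from compiled line numbers to line numbers in the
# original snakefile. A minimal usage sketch follows, mirroring the try/except
# blocks around workflow.execute() elsewhere in this document; the path, line
# numbers, and the raised error are illustrative only.

linemaps = {"/path/to/Snakefile": {1: 1, 2: 3, 5: 4}}

try:
    raise ValueError("something went wrong while evaluating a rule")  # stand-in failure
except BaseException as ex:
    print_exception(ex, linemaps)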
Example #33
0
    def print_job_error(self, job, **kwargs):
        # **kwargs absorbs extra details (e.g. jobid) passed by some executors below
        logger.error("Error in job {} while creating output file{} {}.".format(
            job, "s" if len(job.output) > 1 else "", ", ".join(job.output)))

    def _wait_for_jobs(self):
        """wait for jobs to complete. This means requesting their status,
        and then marking them as finished when a "done" parameter
        shows up. Even for finished jobs, the status request should still
        return a response.
        """
        import googleapiclient

        while True:
            # always use self.lock to avoid race conditions
            with self.lock:
                if not self.wait:
                    return
                active_jobs = self.active_jobs
                self.active_jobs = list()
                still_running = list()

            # Loop through active jobs and act on status
            for j in active_jobs:

                # use self.status_rate_limiter to avoid too many API calls.
                with self.status_rate_limiter:

                    # https://cloud.google.com/life-sciences/docs/reference/rest/v2beta/projects.locations.operations/get
                    # Get status from projects.locations.operations/get
                    operations = self._api.projects().locations().operations()
                    request = operations.get(name=j.jobname)
                    logger.debug("Checking status for operation {}".format(
                        j.jobid))

                    try:
                        status = self._retry_request(request)
                    except googleapiclient.errors.HttpError as ex:

                        # Operation name not found, even finished should be found
                        if ex.status == 404:
                            j.error_callback(j.job)
                            continue

                        # Unpredictable server (500) error
                        elif ex.status == 500:
                            logger.error(ex["content"].decode("utf-8"))
                            j.error_callback(j.job)

                    except WorkflowError as ex:
                        print_exception(ex, self.workflow.linemaps)
                        j.error_callback(j.job)
                        continue

                    # The operation is done
                    if status.get("done", False) == True:

                        # Derive success/failure from status codes (prints too)
                        if self._job_was_successful(status):
                            j.callback(j.job)
                        else:
                            self.print_job_error(j.job, jobid=j.jobid)
                            j.error_callback(j.job)

                    # The operation is still running
                    else:
                        still_running.append(j)

            with self.lock:
                self.active_jobs.extend(still_running)
            sleep()
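# _wait_for_jobs() above calls self._retry_request(), which is not included in
# this example. The helper below only sketches the general idea (retrying a
# Google API request with exponential backoff); the retry count, wait times, and
# the set of handled errors are assumptions, not Snakemake's actual implementation.

import time

import googleapiclient.errors


def retry_request(request, retries=5, wait=5):
    """Execute an API request object, retrying transient HTTP errors with backoff."""
    for attempt in range(retries):
        try:
            return request.execute()
        except googleapiclient.errors.HttpError:
            if attempt == retries - 1:
                raise
            time.sleep(wait * (2 ** attempt))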
Example #35
0
    def execute(
        self, targets=None, dryrun=False,  touch=False, cores=1,
        forcetargets=False, forceall=False, forcerun=None,
        prioritytargets=None, quiet=False, keepgoing=False,
        printshellcmds=False, printreason=False, printdag=False,
        cluster=None, immediate_submit=False, ignore_ambiguity=False,
        workdir=None, printrulegraph=False,
        stats=None, force_incomplete=False, ignore_incomplete=False,
        list_version_changes=False, list_code_changes=False,
        list_input_changes=False, list_params_changes=False,
        summary=False, output_wait=3, nolock=False, unlock=False,
        resources=None, notemp=False, nodeps=False,
        cleanup_metadata=None):

        self.global_resources = dict() if cluster or resources is None else resources
        self.global_resources["_cores"] = cores

        def rules(items):
            return map(self._rules.__getitem__, filter(self.is_rule, items))

        def files(items):
            return map(os.path.relpath, filterfalse(self.is_rule, items))

        if workdir is None:
            workdir = os.getcwd() if self._workdir is None else self._workdir
        os.chdir(workdir)

        if not targets:
            targets = [self.first_rule] if self.first_rule is not None else list()
        if prioritytargets is None:
            prioritytargets = list()
        if forcerun is None:
            forcerun = list()

        priorityrules = set(rules(prioritytargets))
        priorityfiles = set(files(prioritytargets))
        forcerules = set(rules(forcerun))
        forcefiles = set(files(forcerun))
        targetrules = set(chain(
            rules(targets), filterfalse(Rule.has_wildcards, priorityrules),
            filterfalse(Rule.has_wildcards, forcerules)))
        targetfiles = set(chain(files(targets), priorityfiles, forcefiles))
        if forcetargets:
            forcefiles.update(targetfiles)
            forcerules.update(targetrules)

        dag = DAG(
            self, dryrun=dryrun, targetfiles=targetfiles,
            targetrules=targetrules,
            forceall=forceall, forcefiles=forcefiles,
            forcerules=forcerules, priorityfiles=priorityfiles,
            priorityrules=priorityrules, ignore_ambiguity=ignore_ambiguity,
            force_incomplete=force_incomplete,
            ignore_incomplete=ignore_incomplete, notemp=notemp)

        self.persistence = Persistence(nolock=nolock, dag=dag)

        if cleanup_metadata:
            for f in cleanup_metadata:
                self.persistence.cleanup_metadata(f)
            return True

        dag.init()
        dag.check_dynamic()

        if unlock:
            try:
                self.persistence.cleanup_locks()
                logger.warning("Unlocking working directory.")
                return True
            except IOError:
                logger.error("Error: Unlocking the directory {} failed. Maybe "
                "you don't have the permissions?")
                return False
        try:
            self.persistence.lock()
        except IOError:
            logger.critical("Error: Directory cannot be locked. Please make "
                "sure that no other Snakemake process is trying to create "
                "the same files in the following directory:\n{}\n"
                "If you are sure that no other "
                "instances of snakemake are running on this directory, "
                "the remaining lock was likely caused by a kill signal or "
                "a power loss. It can be removed with "
                "the --unlock argument.".format(os.getcwd()))
            return False

        dag.check_incomplete()
        dag.postprocess()

        if nodeps:
            missing_input = [f for job in dag.targetjobs for f in job.input
                             if dag.needrun(job) and not os.path.exists(f)]
            if missing_input:
                logger.critical("Dependency resolution disabled (--nodeps) "
                    "but missing input "
                    "files detected. If this happens on a cluster, please make sure "
                    "that you handle the dependencies yourself or turn off "
                    "--immediate-submit. Missing input files:\n{}".format(
                        "\n".join(missing_input)))
                return False

        if printdag:
            print(dag)
            return True
        elif printrulegraph:
            print(dag.rule_dot())
            return True
        elif summary:
            print("\n".join(dag.summary()))
            return True
        elif list_version_changes:
            items = list(chain(
                *map(self.persistence.version_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_code_changes:
            items = list(chain(
                *map(self.persistence.code_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_input_changes:
            items = list(chain(
                *map(self.persistence.input_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True
        elif list_params_changes:
            items = list(chain(
                *map(self.persistence.params_changed, dag.jobs)))
            if items:
                print(*items, sep="\n")
            return True

        scheduler = JobScheduler(
            self, dag, cores, dryrun=dryrun, touch=touch, cluster=cluster,
            immediate_submit=immediate_submit,
            quiet=quiet, keepgoing=keepgoing,
            printreason=printreason, printshellcmds=printshellcmds,
            output_wait=output_wait)

        if not dryrun and not quiet and len(dag):
            if cluster:
                logger.warning("Provided cluster nodes: {}".format(cores))
            else:
                logger.warning("Provided cores: {}".format(cores))
            logger.warning("\n".join(dag.stats()))

        success = scheduler.schedule()

        if success:
            if dryrun:
                if not quiet:
                    logger.warning("\n".join(dag.stats()))
            elif stats:
                scheduler.stats.to_csv(stats)
        else:
            logger.critical(
                "Exiting because a job execution failed. "
                "Look above for error message")
            return False
        return True
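# Both execute() variants in this document take an output_wait/latency_wait
# argument: after a job finishes, Snakemake waits up to that many seconds for the
# declared output files to appear, to tolerate filesystem latency on clusters.
# The helper below only sketches that behaviour and is not Snakemake's actual
# implementation.

import os
import time


def wait_for_output_files(files, latency_wait=3):
    """Block until all files exist, or raise IOError after `latency_wait` seconds."""
    deadline = time.time() + latency_wait
    missing = [f for f in files if not os.path.exists(f)]
    while missing and time.time() < deadline:
        time.sleep(1)
        missing = [f for f in files if not os.path.exists(f)]
    if missing:
        raise IOError("Missing files after {} seconds:\n{}".format(
            latency_wait, "\n".join(missing)))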