def extract_wildcards(pattern, target):
    """
    Extract the wildcard values that `target` supplies for `pattern`.

    The pattern is converted to a regular expression with ``regex`` and
    matched against the beginning of `target`.

    Parameters
    ----------
    pattern : str
        Snakemake-style filename pattern, e.g. ``{output}/{sample}.bam``.

    target : str
        Filename from which to extract wildcards, e.g., ``data/a.bam``.

    Returns
    -------
    dict or None
        Mapping of wildcard name to the matched text, or None when the
        regular expression does not match `target`.

    Examples
    --------
    >>> pattern = '{output}/{sample}.bam'
    >>> target = 'data/a.bam'
    >>> expected = {'output': 'data', 'sample': 'a'}
    >>> assert extract_wildcards(pattern, target) == expected
    >>> assert extract_wildcards(pattern, 'asdf') is None
    """
    compiled = re.compile(regex(pattern))
    found = compiled.match(target)
    if found is None:
        return None
    return found.groupdict()
def listfiles(pattern, restriction=None, omit_value=None):
    """
    Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Arguments
    pattern     -- a filepattern. Wildcards are specified in snakemake
                   syntax, e.g. "{id}.txt"
    restriction -- optional dict of wildcard name -> required value. A path
                   is skipped when any enforced value differs from the
                   matched wildcard value.
    omit_value  -- optional marker; a restriction value that contains it is
                   not enforced. With the default None every restriction
                   value is enforced.
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    # Start the walk at the deepest directory that contains no wildcard.
    if first_wildcard:
        static_prefix = pattern[:first_wildcard.start()]
    else:
        static_prefix = pattern
    # An empty dirname means "current directory"; os.walk("") would
    # silently yield nothing.
    dirname = os.path.dirname(static_prefix) or "."
    compiled = re.compile(regex(pattern))

    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                # Normalise so that paths found below "." do not carry a
                # "./" prefix that would leak into the wildcard values.
                f = os.path.normpath(os.path.join(dirpath, f))
            match = compiled.match(f)
            # The length check rejects matches that stop short of the end
            # of the path.
            if not match or len(match.group()) != len(f):
                continue
            wildcards = Namedlist(fromdict=match.groupdict())
            if restriction is not None and any(
                    # Guard against `None in <str>`, which raises TypeError.
                    (omit_value is None or omit_value not in v)
                    and v != wildcards[k]
                    for k, v in restriction.items()):
                continue
            yield f, wildcards
def listfiles(pattern, restriction=None, omit_value=None):
    """
    Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Arguments
    pattern     -- a filepattern. Wildcards are specified in snakemake
                   syntax, e.g. "{id}.txt"
    restriction -- optional dict mapping wildcard names to the value they
                   must take; paths whose wildcards disagree with an
                   enforced value are not yielded
    omit_value  -- optional marker; restriction values containing it are
                   ignored. None (the default) means no value is ignored.
    """

    def violates_restriction(wildcards):
        # True when some enforced restriction value differs from the match.
        for k, v in restriction.items():
            # `None not in "text"` raises TypeError, so treat None as
            # "nothing to omit".
            enforced = omit_value is None or omit_value not in v
            if enforced and v != wildcards[k]:
                return True
        return False

    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        dirname = os.path.dirname(pattern[:first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        # Applies to wildcard-free patterns as well: os.walk("") yields
        # nothing, so fall back to the current directory.
        dirname = "."

    matcher = re.compile(regex(pattern))
    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                # normpath removes the "./" prefix os.walk(".") puts on
                # sub-directories, which would otherwise end up in wildcards.
                f = os.path.normpath(os.path.join(dirpath, f))
            match = matcher.match(f)
            # Only accept matches that consume the whole path.
            if match and len(match.group()) == len(f):
                wildcards = Namedlist(fromdict=match.groupdict())
                if restriction is None or not violates_restriction(wildcards):
                    yield f, wildcards
def get_fns_analysis(wildcards):
    """Translate each source filename for `wildcards` into a target filename.

    Every filename returned by ``source_fkt(wildcards)`` is matched against
    the regular expression built from ``source_pattern``. The captured
    groups, together with ``extra_wildcards``, are expanded into
    ``target_pattern`` (with wildcard constraints stripped), and the first
    expansion is collected.

    Returns a list with one target filename per source filename. A source
    filename that does not match the source pattern raises AttributeError,
    because the failed match is None.
    """
    source_re = re.compile(regex(str(source_pattern)))
    targets = []
    for source_fn in source_fkt(wildcards):
        captured = source_re.match(source_fn).groupdict()
        template = strip_wildcard_constraints(str(target_pattern))
        expanded = expand(
            template, **captured, **extra_wildcards, allow_missing=True
        )
        targets.append(expanded[0])
    return targets
def regex(self):
    """Return the compiled regular expression for ``self.file``.

    The expression is built on first access and cached in ``self._regex``.
    Building it also resets ``self._groupdict`` to map every named group to
    None.

    Raises MissingRequiredKeyException when one of ``self._required_keys``
    is not among ``self.keys()``.

    NOTE(review): the required-key check is assumed to run only when the
    expression is first compiled; the original nesting is not visible here.
    """
    if self._regex is not None:
        return self._regex

    # regex() ends its pattern with "$"; drop that last character.
    source = regex(self.file)[:-1]
    self._regex = re.compile(source)
    self._groupdict = dict.fromkeys(self._regex.groupindex.keys())

    missing = [key for key in self._required_keys if key not in self.keys()]
    if missing:
        raise MissingRequiredKeyException(
            """some of the required keys {reqkeys} not in regexp {regexp}""".format(
                reqkeys=",".join(self._required_keys), regexp=self._regex))
    return self._regex
def _compile(self, level):
    """ Use snakemake regex to compile regex from format string.

    Snakemake provides a nice regex function that converts format strings
    to regex. To narrow the results, the first occurrence of each
    sampleTable column name in the pattern is given a wildcard constraint
    listing that column's unique values.

    Parameters
    ----------
    level: str
        A string consisting of rawLevel, runLevel, sampleLevel, aggLevel

    Returns
    -------
    str
        regex pattern generated by snakemake.io.regex, without its last
        character (the trailing ``$``) so that partial matches are allowed

    Example
    -------
    >>> SH = SampleHandler(test_config)
    >>> level = 'runLevel'
    >>> pattern = SH.config[level]
    >>> pattern
    'pasilla_sample/{sampleID}/{sampleID}_{treatment}_{replicate}_R1'
    >>> assert SH._compile(level) == (
    ... 'pasilla_sample\\\\/(?P<sampleID>treated1|treated2|untreated1'
    ... '|untreated2)\\\\/(?P=sampleID)_(?P<treatment>treated|untreated)'
    ... '_(?P<replicate>1|2)_R1'
    ... )
    """
    pattern = self.config[level]

    columns = self.sampleTable.reset_index().to_dict('list')
    for name, values in columns.items():
        # Substitute the first instance of each sampleTable column name and
        # add the unique list of column values. This will help narrow down
        # regex. NOTE: This may not be needed, but thought it might be useful.
        placeholder = '{{{name}}}'.format(name=name)
        # str() lets non-string column values (e.g. integers) be joined.
        alternatives = '|'.join(str(value) for value in sorted(set(values)))
        constrained = '{' + '{name}, {res}'.format(
            name=name, res=alternatives) + '}'
        # Plain text replacement: with re.sub the column name would be read
        # as a regex and the values as a replacement template, so names
        # such as "3" or values containing backslashes would fail.
        pattern = pattern.replace(placeholder, constrained, 1)

    # Return regex removing the '$' off of the end to allow partial matches
    return regex(pattern)[:-1]
def get_checkpoint_ids(self, stack, mygroup, target):
    """Collect the bin identifiers written by this object's checkpoint.

    The wildcards are parsed from ``stack.path`` using the regex of this
    object's output pattern. For every target id returned by
    ``self.get_ids`` the checkpoint job is looked up and its ``bins``
    output file is read, one identifier per line.

    Returns a list of the unique, whitespace-stripped lines (unordered).

    Raises RuntimeError when more than one checkpoint is registered.
    """
    if len(self.checkpoints) > 1:
        raise RuntimeError("Multiple checkpoints not implemented")

    from snakemake.workflow import checkpoints
    from snakemake.io import regex

    output_regex = regex(self._wildcards(self.name, {'field': 'output'}))
    wildcards = re.match(output_regex, stack.path).groupdict()

    checkpoint_name = next(iter(self.checkpoints.keys()))
    checkpoint = getattr(checkpoints, checkpoint_name)

    # Every member of the stack's group except the stack itself.
    siblings = [g for g in stack.group if g != stack]

    bins = set()
    for mytarget in self.get_ids(stack, siblings, mygroup, target):
        wildcards['target'] = mytarget
        job = checkpoint.get(**wildcards)
        with open(job.output.bins, "r") as fd:
            for line in fd.readlines():
                bins.add(line.strip())
    return list(bins)
def listfiles(pattern, restriction=None, omit_value=None):
    """Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Args:
        pattern (str): a filepattern. Wildcards are specified in snakemake
            syntax, e.g. "{id}.txt"
        restriction (dict): restrict to wildcard values given in this
            dictionary
        omit_value (str): wildcard value to omit; restriction values that
            contain it are not enforced. With the default ``None`` every
            restriction value is enforced.

    Yields:
        tuple: The next file matching the pattern, and the corresponding
        wildcards object
    """
    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    # Walk from the deepest directory that is free of wildcards.
    static_prefix = pattern[: first_wildcard.start()] if first_wildcard else pattern
    # os.walk("") yields nothing, so a bare filename pattern has to be
    # searched relative to the current directory.
    dirname = os.path.dirname(static_prefix) or "."
    compiled = re.compile(regex(pattern))

    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                f = os.path.normpath(os.path.join(dirpath, f))
            match = compiled.match(f)
            if not match:
                continue
            wildcards = Namedlist(fromdict=match.groupdict())
            if restriction is not None and any(
                # `None not in "text"` raises TypeError; None means that no
                # restriction value is exempt.
                (omit_value is None or omit_value not in v)
                and v != wildcards[k]
                for k, v in restriction.items()
            ):
                continue
            yield f, wildcards
def listfiles(pattern, restriction=None, omit_value=None):
    """Yield a tuple of existing filepaths for the given pattern.

    Wildcard values are yielded as the second tuple item.

    Args:
        pattern (str):       a filepattern. Wildcards are specified in
                             snakemake syntax, e.g. "{id}.txt"
        restriction (dict):  restrict to wildcard values given in this
                             dictionary
        omit_value (str):    wildcard value to omit; a restriction value
                             containing it is skipped. ``None`` (default)
                             skips nothing.

    Yields:
        tuple: The next file matching the pattern, and the corresponding
        wildcards object
    """

    def is_allowed(wildcards):
        # A path is allowed unless an enforced restriction value differs
        # from what the pattern captured.
        if restriction is None:
            return True
        for k, v in restriction.items():
            # Testing `None in v` on a string raises TypeError, hence the
            # explicit None check.
            if omit_value is not None and omit_value in v:
                continue
            if v != wildcards[k]:
                return False
        return True

    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        dirname = os.path.dirname(pattern[:first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        # Also needed for wildcard-free patterns: os.walk("") finds nothing.
        dirname = "."

    path_re = re.compile(regex(pattern))
    for dirpath, dirnames, filenames in os.walk(dirname):
        for f in chain(filenames, dirnames):
            if dirpath != ".":
                f = os.path.normpath(os.path.join(dirpath, f))
            match = path_re.match(f)
            if match:
                wildcards = Namedlist(fromdict=match.groupdict())
                if is_allowed(wildcards):
                    yield f, wildcards
def glob_wildcards(pattern, files=None):
    """
    Glob the values of the wildcards by matching the given pattern to the
    filesystem, or to `files` when that is given.

    Arguments
    pattern -- a filepattern with wildcards in snakemake syntax
    files   -- optional iterable of paths to match instead of walking the
               filesystem; each path is normalised before matching

    Returns a named tuple with a list of values for each wildcard. A
    wildcard that occurs several times in the pattern gets a single field.

    Progress messages are printed to stdout before and after the search.
    """
    from snakemake.io import _wildcard_regex, namedtuple, regex
    import regex as re

    pattern = os.path.normpath(pattern)
    first_wildcard = re.search("{[^{]", pattern)
    if first_wildcard:
        dirname = os.path.dirname(pattern[:first_wildcard.start()])
    else:
        dirname = os.path.dirname(pattern)
    if not dirname:
        dirname = "."

    # Keep the first occurrence of each wildcard name: namedtuple raises
    # ValueError on duplicate field names.
    names = []
    for name_match in _wildcard_regex.finditer(pattern):
        name = name_match.group('name')
        if name not in names:
            names.append(name)
    Wildcards = namedtuple("Wildcards", names)
    wildcards = Wildcards(*[[] for _ in names])

    pattern = regex(pattern)
    # work around partial matching bug in python regex module
    # by replacing each escaped "/" with "[/\0]" (0x0 can't occur in
    # filenames)
    pattern = re.sub('\\\\/', '[/\0]', pattern)
    cpattern = re.compile(pattern)

    def walker(dirname, pattern):
        """finds files/dirs matching `pattern` in `dirname`"""
        for dirpath, dirnames, filenames in os.walk(dirname):
            dirpath = os.path.normpath(dirpath)
            for f in filenames:
                if dirpath != ".":
                    f = os.path.join(dirpath, f)
                match = pattern.match(f)
                if match:
                    yield match
            # Iterate backwards so entries can be deleted in place;
            # os.walk does not descend into directories removed from
            # `dirnames`.
            for i in range(len(dirnames) - 1, -1, -1):
                d = dirnames[i]
                if dirpath != ".":
                    d = os.path.join(dirpath, d)
                match = pattern.match(os.path.join(d, ""), partial=True)
                if not match:
                    # Nothing below this directory can match: prune it.
                    del dirnames[i]
                    continue
                if match.partial:
                    # Only a prefix matched so far: descend, do not report.
                    continue
                yield match

    def record(match):
        # Append every captured wildcard value to its result list.
        for name, value in match.groupdict().items():
            getattr(wildcards, name).append(value)

    print("searching {}".format(pattern))
    if files is None:
        for match in walker(dirname, cpattern):
            record(match)
    else:
        for f in files:
            # os.path.normpath, not os.normpath: the latter does not exist
            # and raised AttributeError.
            match = cpattern.match(os.path.normpath(f))
            if match:
                record(match)
    print("searching {}: done".format(pattern))
    return wildcards