def inner(self, outfile, *args, **kwargs):
    # Closure wrapper around a task callable ``f`` bound in the enclosing
    # (not visible) decorator scope: any exception raised by the task is
    # downgraded to a warning and *outfile* is touched so downstream
    # pipeline steps see a (possibly empty) result and can continue.
    # NOTE(review): ``f()`` is invoked without ``self``, ``outfile`` or the
    # extra ``*args``/``**kwargs`` received here — confirm the enclosing
    # decorator binds them already; otherwise the arguments are silently
    # dropped and the wrapped task runs with no inputs.
    try:
        f()
    except Exception as e:
        E.warn("received exception {} - touching {}".format(
            str(e), outfile))
        IOTools.touch_file(outfile)
def run(self, infile, outfile, params):
    """Compare *infile* against a reference genome with the MUMmer suite.

    Executes nucmer, dnadiff and mummerplot in sequence; all tool output
    is redirected next to *outfile*, and *outfile* itself is touched as
    the completion marker.

    Raises
    ------
    ValueError
        If ``params.reference_fasta`` is not configured.
    """
    if params.reference_fasta is None:
        raise ValueError("please provide a reference database")

    # One shell command per tool; they share the {outfile} prefix.
    commands = [
        "{params.path_nucmer} -p {outfile} {params.reference_fasta} {infile} >& {outfile}.nucmer",
        "{params.path_dnadiff} -p {outfile} -d {outfile}.delta >& {outfile}.dnadiff",
        "{params.path_mummerplot} --large --fat --png {outfile}.1delta >& {outfile}.mummerplot",
    ]
    statement = "; ".join(commands).format(**locals())

    retval = P.run(statement)
    IOTools.touch_file(outfile)
    return retval
def ignore_task(self, infiles, outfiles, params):
    """Return True if this task should be skipped at runtime.

    A task is skipped when any of the configured ignore patterns
    (``self._ignore``) occurs in the string form of *outfiles*. When
    skipped, each output file is created empty so downstream tasks
    still find their inputs.
    """
    if not self._ignore:
        return False

    haystack = str(outfiles)
    matched = any(pattern in haystack
                  for pattern in IOTools.val2list(self._ignore))
    if not matched:
        return False

    E.warn("task {} will be ignored".format(self.__name__))
    for fn in IOTools.val2list(outfiles):
        E.info("creating empty file {}".format(fn))
        IOTools.touch_file(fn)
    return True
def run(self, infiles, outfile, params):
    """Filter nanopore fastq files and split reads into pass/fail sets.

    Input files can first be dropped by minimum size
    (``params.min_size_bytes``) and by modification time relative to the
    file named by ``params.newer_than``. Surviving files are streamed
    through ``daisy fastq2fastq``, which writes passing reads to
    *outfile* and removed reads to the parallel ``-fail.fastq.gz`` file.
    If no input files remain, an empty *outfile* is created instead.

    Raises
    ------
    ValueError
        If *outfile* does not end in ``-pass.fastq.gz``.
    """
    if not outfile.endswith("-pass.fastq.gz"):
        raise ValueError(
            "outfile must end in -pass.fastq.gz, got {}".format(outfile))

    if params.min_size_bytes:
        before = len(infiles)
        infiles = [fn for fn in infiles
                   if os.path.getsize(fn) >= params.min_size_bytes]
        E.debug(
            "removing small files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if params.newer_than:
        before = len(infiles)
        cutoff = os.path.getmtime(params.newer_than)
        infiles = [fn for fn in infiles if os.path.getmtime(fn) > cutoff]
        E.debug(
            "removing old files: after={}, before={}, removed={}".format(
                len(infiles), before, before - len(infiles)))

    if not infiles:
        E.warn("no files left after filtering, creating empty file")
        IOTools.touch_file(outfile)
        return

    # Failing reads go next to the passing ones, with a mirrored suffix.
    outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz"
    infiles = " ".join(infiles)

    statement = ("zcat {infiles} "
                 "| daisy fastq2fastq "
                 "--method=filter-ONT "
                 "--min-average-quality={params.min_average_quality} "
                 "--log={outfile}.log "
                 "--min-length={params.min_length} "
                 "--output-removed-fastq={outfile_fail} "
                 "- "
                 "| gzip "
                 "> {outfile}".format(**locals()))
    return P.run(statement)
def run(self, infile, outfile, params):
    """Align *infile* against a reference database with lastal and plot.

    The alignment is split, sorted and stored compressed as
    ``{outfile}.maf.gz``; a dotplot of the length-filtered alignments is
    then rendered to ``{outfile}.png``. *outfile* itself is touched as
    the completion marker.

    Raises
    ------
    ValueError
        If ``params.reference_database`` is not configured.
    """
    if params.reference_database is None:
        raise ValueError("please provide a reference database")

    align_cmd = ("{params.path_lastal} {params.lastal_options} "
                 "{params.reference_database} {infile} "
                 "| {params.path_lastsplit} {params.lastsplit_options} "
                 "| {params.path_mafsort} "
                 "| gzip "
                 "> {outfile}.maf.gz")
    # Dotplot reads the filtered alignments via process substitution.
    plot_cmd = ("{params.path_lastdotplot} "
                "<(zcat {outfile}.maf.gz "
                "| daisy maf2maf --log={outfile}.filter.log --min-length={params.min_contig_length} ) "
                "{outfile}.png ")
    statement = "; ".join((align_cmd, plot_cmd)).format(**locals())

    retval = P.run(statement, job_memory="15G")
    IOTools.touch_file(outfile)
    return retval
def touch_and_mark_as_mounted(source, dest):
    """Create *dest* carrying *source*'s timestamps and record its origin.

    *dest* is touched with the access/modification times of *source*,
    and a sidecar ``{dest}.mnt`` file is written containing the mounted
    location of *source*.
    """
    stat_info = os.stat(source)
    IOTools.touch_file(dest, times=(stat_info.st_atime, stat_info.st_mtime))
    marker = dest + ".mnt"
    with open(marker, "w") as outf:
        outf.write(get_mounted_location(source))
def __call__(self, infiles, outfile, only_info=False):
    """Execute this task: record metadata, resolve inputs, run, benchmark.

    Orchestrates one pipeline task invocation: ensures the output
    directory exists, saves meta information, honours the configured
    ignore lists (creating empty placeholder outputs when skipping),
    remaps placeholder ``.mnt`` inputs to their mount location, runs the
    task and finally saves its benchmark data.
    """
    # NOTE: extras not implemented in ruffus 2.6.3, thus
    # use parameter:
    only_info = "only_info" in P.PARAMS

    # ensure output directory exists.
    # This should be done on the pipeline level, but
    # ruffus currently seems not to allow this.
    outdir = os.path.dirname(outfile)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    # one output file per table this task produces
    output_files = [self.map_table_to_file(x, outfile)
                    for x in self.tablenames]

    kwargs = {'output_files': output_files,
              'input_files': infiles,
              'outdir': outdir}

    if self._runtime_regex:
        kwargs["alias"] = self.build_alias(str(infiles),
                                           regex=self._runtime_regex,
                                           alias=self._runtime_alias)

    # meta information is saved even if the task is skipped below
    self.save_meta(outfile, **kwargs)

    # runtime ignore list: skip tasks whose output directory matches,
    # leaving an empty placeholder file behind
    if self.ignore:
        found = False
        for i in self.ignore:
            if i in outdir:
                found = True
                break
        if found:
            E.warn("skipping task {} at runtime, an empty file is created".format(
                outfile))
            IOTools.touch_file(outfile)
            return

    # if self.runtime_filter:
    # TODO: create empty outfile if regex matches
    #    pass

    # info-only mode: metadata has been refreshed above, nothing to run
    if only_info:
        E.warn(
            "only_info - meta information in {} has been updated".format(
                IOTools.snip(outfile) + ".info"))
        return

    # AH: duplicated from above?
    params = self.build_params(output_files=output_files)

    on_error_options = ["raise", "ignore"]
    on_error = params.get("on_error", "raise")
    if on_error not in on_error_options:
        raise ValueError("unknown option to 'on_error': '{}' "
                         "should be one of '{}'".format(
                             on_error, ",".join(on_error_options)))

    # configuration-level ignore list; creates empty outputs when hit
    if self.ignore_task(infiles, outfile, params):
        return

    # deal with placeholder files created by identity that are
    # located on a remote mount point
    def map_to_mount(fn):
        # a sidecar ``{fn}.mnt`` file holds the path relative to the
        # configured mount point; resolve it, otherwise use fn as-is
        if os.path.exists(fn + ".mnt"):
            if not P.PARAMS["mount_point"]:
                raise ValueError(
                    "encountered mounted file {}, but no mount point present"
                    .format(fn))
            with open(fn + ".mnt") as inf:
                mount_path = inf.read()
            return os.path.join(P.PARAMS["mount_point"], mount_path)
        else:
            return fn

    # replace infiles with mount locations if necessary
    if isinstance(infiles, list):
        infiles = [map_to_mount(x) for x in infiles]
    else:
        infiles = map_to_mount(infiles)

    try:
        benchmark = self.run(infiles, outfile, as_namedtuple(params))
    except Exception as ex:
        on_error = params.get("on_error", "raise")
        if on_error == "raise":
            raise
        elif on_error == "ignore":
            # best-effort mode: log the failure and leave an empty
            # output so the pipeline can continue
            E.warn(
                "error occured during execution of {} but will be ignored:\n{}"
                .format(self.__name__, ex))
            E.warn(
                "an empty output file {} will be created.".format(outfile))
            IOTools.touch_file(outfile)
            benchmark = None

    if benchmark:
        self.save_benchmark(outfile, benchmark)
def run(self, infile, outfile, params):
    """Run FastQC on *infile* and split its report into per-section TSVs.

    If the extracted ``fastqc_data.txt`` is already present the FastQC
    invocation is skipped and *outfile* is simply touched. Each
    non-empty report section is written to ``{outfile}.{tablename}.tsv``
    and a summary table of section statuses is written alongside.

    Raises ValueError if a section maps to a table name not listed in
    ``self.tablenames``.
    """
    # TODO: bam_fastqc_sequence_length_distribution.tsv may
    # contain ranges such as '30-31'. Convert to beginning of
    # range like in this perl command:
    #
    # perl -p -i -e "s/\-\d+//"
    # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

    # FastQC names its output directory after the input file without
    # the compression suffix
    if infile.endswith(".gz"):
        prefix = IOTools.snip(os.path.basename(infile[:-3]))
    else:
        prefix = IOTools.snip(os.path.basename(infile))

    outdir = os.path.dirname(outfile)

    datafile = os.path.join(outdir,
                            "{}_fastqc".format(prefix),
                            "fastqc_data.txt")

    # only run FastQC when its extracted report is not already present
    if not os.path.exists(datafile):
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        retval = P.run(
            "{params.path} "
            "{params.options} "
            "--extract "
            "--outdir {outdir} "
            "{infile} "
            ">& {outfile} ".format(**locals()), **params._asdict())
    else:
        IOTools.touch_file(outfile)
        retval = None

    def _split_output(lines):
        # generator yielding (section, header, body, status) for each
        # ">>section ... >>END_MODULE" block in fastqc_data.txt
        body, header, section, status = [], None, None, None
        for line in lines:
            if line.startswith("##FastQC"):
                continue
            elif line.startswith("#"):
                # column header line; starts a fresh body
                header, body = line[1:-1].split("\t"), []
            elif line.startswith(">>END_MODULE"):
                yield section, header, body, status
                body, header, section, status = [], None, None, None
            elif line.startswith(">>"):
                # section start: ">>Section Name<TAB>status"
                section, status = line[2:-1].split("\t")
            else:
                fields = line[:-1].split("\t")
                body.append(fields)

    # split into separate files for upload
    summary_data = []
    with IOTools.open_file(datafile) as inf:
        for section, header, body, status in _split_output(inf):
            if len(body) == 0:
                continue
            summary_data.append((section, status))
            tablename = "{}_".format(self.name) + re.sub(
                " ", "_", section).lower()
            if tablename not in self.tablenames:
                raise ValueError(
                    "unknown tablename {}, expected one of {}".format(
                        tablename, self.tablenames))
            output_file = ".".join((outfile, tablename, "tsv"))
            with open(output_file, "w") as outf:
                outf.write("\t".join([x.lower() for x in header]) + "\n")
                # remove first column, which contains the identifier
                # NOTE(review): the comment above does not match the code —
                # the full row including the first column is written; confirm
                # whether column removal was intended here.
                outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

    # write the per-section status summary table
    output_file = ".".join(
        (outfile, "{}_summary".format(self.name), "tsv"))
    with IOTools.open_file(output_file, "w") as outf:
        outf.write("section\tstatus\n")
        for section, status in summary_data:
            outf.write("{}\t{}\n".format(section, status))

    return retval