def run(self, infile, outfile, params): try: retval = P.run("{params.path} view -H " "{infile} " "2> {outfile}.log " "> {outfile}.tmp; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) with open(outfile, "w") as outf, open(outfile + ".tmp") as inf: outf.write("header_tag\ttag\tlineno\tvalue\n") for lineno, line in enumerate(inf): fields = line[1:-1].split("\t") header_tag = fields[0] if header_tag == "CO": # Do not split comment lines outf.write("\t".join((header_tag, "", str(lineno), "\t".join(fields[1:]))) + "\n") else: for field in fields[1:]: sub_tag, content = field.split(":", 1) outf.write("\t".join((header_tag, sub_tag, str(lineno), content)) + "\n") os.unlink(outfile + ".tmp") return retval
def inner(self, outfile, *args, **kwargs):
    try:
        # forward all arguments to the wrapped task function
        f(self, outfile, *args, **kwargs)
    except Exception as e:
        E.warn("received exception {} - touching {}".format(
            str(e), outfile))
        IOTools.touch_file(outfile)
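
# A minimal sketch of the decorator this wrapper presumably belongs to; the
# name "touch_on_error" is an assumption, not the original. If the wrapped
# task raises, the error is logged and the output file is touched so that
# downstream ruffus tasks see an (empty) up-to-date target.
import functools

def touch_on_error(f):
    @functools.wraps(f)
    def inner(self, outfile, *args, **kwargs):
        try:
            f(self, outfile, *args, **kwargs)
        except Exception as e:
            E.warn("received exception {} - touching {}".format(
                str(e), outfile))
            IOTools.touch_file(outfile)
    return inner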
def __call__(self, infiles, outfile, only_info=False):
    # NOTE: extras not implemented in ruffus 2.6.3, thus
    # use parameter:
    only_info = "only_info" in P.PARAMS

    self.input_files = collect_file_meta_information({"in": infiles})
    self.input_alias = self.build_alias(str(infiles),
                                        regex=self._input_regex,
                                        alias=self._input_alias)

    is_empty_outfile = outfile == []
    # patch for missing outfiles when number of outfiles is not
    # known (is it a ruffus thing?)
    if is_empty_outfile:
        assert isinstance(infiles, str)
        outdir = "{}/{}.dir".format(os.path.dirname(infiles), self.name)
        outfile = os.path.join(outdir, self.output)
        # dummy.info gets removed and then added
        self.save_meta(os.path.join(outdir, "dummy.info"))
    else:
        self.save_meta(outfile)

    if only_info:
        E.warn(
            "only_info - meta information in {} has been updated".format(
                self.build_meta_filename(outfile, "benchmark.info")))
        return

    params = self.build_params()
    benchmark = self.run(infiles, outfile, as_namedtuple(params))
    if not is_empty_outfile:
        self.save_benchmark(outfile, benchmark)
def run(self, outfile, params): bam = resolve_argument(params.bam, sep=" ") reference_fasta = get_reference(params) # warning: requires -m or -c in the options if "--multiallelic-caller" not in params.options and \ "-m" not in params.options and \ "-c" not in params.options and \ "--consensus-caller" not in params.options: E.warn("bcftools call requires -m or -c, got {}".format( params.options)) # limit number of jobs to node to limit I/O job_threads = 4 return P.run("{params.path_samtools} mpileup " "-ug " "-f {reference_fasta} " "{params.samtools_options} " "{bam} " "2> {outfile}.pileup.log " "| {params.path} call " "--variants-only " "--output-type z " "{params.options} " "2> {outfile}.call.log " "> {outfile}; " "tabix -p vcf {outfile} ".format(**locals()))
def get_reference_for_bam(bamfile, fastafiles):
    """deduce the reference sequence used within a BAM file.

    This method compares the sequence dictionary in the bamfile with a
    list of fastafiles. The comparison stops at the first match found.

    :param bamfile: :term:`BAM` formatted file
    :param fastafiles: list of :term:`fasta` formatted files. The fasta
        files need to be indexed with samtools faidx.

    :return: a tuple (filename, diffs). The first element is the
        filename of the matching fasta file if found, otherwise None.
        If not found, diffs is a list of all input files with their
        missing contigs or length mismatches.
    """
    diffs = []

    # Temporary fix: see issue SYS-517
    if not os.path.exists(bamfile):
        E.warn("could not find file {}".format(bamfile))

    try:
        with pysam.AlignmentFile(bamfile, check_sq=False) as inf:
            sequence_dict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("could not open bamfile {}: {}".format(bamfile, ex))
        return None, None

    fastafn, diffs = match_sequence_dictionaries(sequence_dict, fastafiles)
    return fastafn, diffs
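
# The match_sequence_dictionaries() helper used above is not shown here.
# This is a hedged sketch of what it plausibly does (hypothetical
# implementation, reusing the get_sequence_length_dict() helper defined
# further below): return the first fasta whose contig/length dictionary
# fully covers the BAM's, otherwise collect the differences.
def match_sequence_dictionaries_sketch(sequence_dict, fastafiles):
    diffs = []
    for fastafn in fastafiles:
        fastadict = get_sequence_length_dict(fastafn)
        if fastadict is None:
            continue
        missing = sorted(set(sequence_dict) - set(fastadict))
        mismatched = [contig for contig, length in sequence_dict.items()
                      if contig in fastadict and fastadict[contig] != length]
        if not missing and not mismatched:
            # first full match wins
            return fastafn, []
        diffs.append((fastafn, missing, mismatched))
    return None, diffs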
def save_benchmark(self, outfile, benchmark):
    if not isinstance(benchmark, list):
        benchmark = [benchmark]

    # flatten if nested list and remove None
    benchmark = [
        x for x in IOTools.flatten(benchmark, ltypes=(list, ))
        if x is not None
    ]

    filename = self.build_meta_filename(outfile, "benchmark.bench")
    if not benchmark:
        E.warn("could not save benchmark info to {}".format(filename))
        return

    try:
        header = benchmark[0]._fields
    except AttributeError as ex:
        E.warn("could not save benchmark timings for {}:"
               " {} from {}".format(outfile, str(ex), str(benchmark[0])))
        return

    with open(filename, "w") as outf:
        outf.write("\t".join(header) + "\n")
        for b in benchmark:
            outf.write("\t".join(map(str, b)) + "\n")
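
# Toy illustration of the records save_benchmark() expects: anything
# namedtuple-like with a _fields attribute. The field names here are made
# up for the example; the real tuples come from P.run.
import collections

Benchmark = collections.namedtuple("Benchmark", "statement wall user")
b = Benchmark(statement="samtools stats", wall=1.2, user=0.9)
print("\t".join(b._fields))    # statement  wall  user
print("\t".join(map(str, b)))  # samtools stats  1.2  0.9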
def run(self, infile, outfiles, params):
    tbxfile = pysam.VariantFile(infile)

    statements = []
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        statements.append(
            "mkdir {output_dir}; "
            "tabix -h {infile} {chrom} | bgzip > {output_file}; "
            "tabix -p vcf {output_file} ".format(**locals()))

    retvals = P.run(statements)

    # clean up empty vcfs - opening an empty VCF in pysam throws a
    # ValueError
    for chrom in list(tbxfile.header.contigs):
        output_file = outfiles.format(chrom)
        output_dir = os.path.dirname(output_file)
        try:
            f = pysam.VariantFile(output_file)
            f.close()
        except ValueError:
            E.warn("removing empty VCF {}".format(output_file))
            shutil.rmtree(output_dir)

    tbxfile.close()
    return retvals
def run(self, outfile, params):
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)

    stmnts = []
    prefix = IOTools.snip(outfile, ".vcf.gz")
    vcf_output = prefix + ".raw.vcf.gz"

    if not os.path.exists(vcf_output):
        stmnts.append("java "
                      "-Djava.io.tmpdir=%(tmpdir)s "
                      "-jar {self.path} "
                      "--analysis_type HaplotypeCaller "
                      "--input_file {bam} "
                      "--reference_sequence {reference_fasta} "
                      "--logging_level INFO "
                      "--log_to_file {outfile}.HaplotypeCaller.log "
                      "{params.haplotypecaller} "
                      "--out {vcf_output} "
                      ">& {prefix}.HaplotypeCaller.err".format(**locals()))
    else:
        E.warn("output file {vcf_output} already exists - "
               "it will not be recomputed".format(**locals()))

    stmnts.extend(
        self.build_calibration_workflow(outfile, prefix, vcf_output,
                                        params))

    return self.run_statements(stmnts, job_memory="5G")
def _test_task_will_run(self, taskf): # check if task is installed version = get_task_version(taskf) if version is None: self.skipTest("tools for task {} not available".format(taskf.name)) return # define input/output files tool_config = self.test_config["tool"] input_files = {} for expected in taskf.expected: if taskf.name in tool_config: # task specific input files p = tool_config[taskf.name].get( expected, tool_config.get(expected, None)) else: # generic input files p = tool_config.get(expected, None) if p is None: self.skipTest( "data for input slot {} not provided for {}".format( expected, taskf.name)) return input_files[expected] = p tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}".format(taskf.name)) if isinstance(taskf.output, str): outfile = os.path.join(tmpdir, taskf.output) else: outfile = [os.path.join(tmpdir, x) for x in taskf.output] # instantiate task t = taskf() # set custom options for test if taskf.name in tool_config: for key, value in tool_config[taskf.name].items(): setattr(t, key, value) # run task t.register_input(input_files) t(input_files.values(), outfile) # check if tool produced non-zero output if isinstance(outfile, list): for x in outfile: self.assertTrue(os.path.exists(x)) self.assertGreater(os.path.getsize(x), 0) else: self.assertTrue(os.path.exists(outfile)) self.assertGreater(os.path.getsize(outfile), 0) # cleanup try: shutil.rmtree(tmpdir) except OSError as ex: E.warn("could not remove {}: {}".format(tmpdir, ex))
def run(self, infile, outfile, params): if "reference_fasta" in params._fields: reference_fasta = "REFERENCE_SEQUENCE={}".format( params.reference_fasta) else: reference_fasta = "" # command can fail when no output is produced, but still produce output # 12G is required for java overhead retval = P.run("java -Xmx8000m -jar {params.path} " "CollectMultipleMetrics " "{reference_fasta} " "INPUT={infile} " "TMP_DIR=%(tmpdir)s " "{params.options} " "OUTPUT={outfile} " ">& {outfile} ".format(**locals()), job_memory="12G", ignore_errors=True) def get_section(section, data): pattern = "## {}".format(section) keep = False result = [] for line in data: if line.startswith("##"): if line.startswith(pattern): keep = True else: keep = False if keep: result.append(line) return result for tablename in self.tablenames: filename = re.sub("histogram", "metrics", tablename) raw = filename[len("picard_"):] src = outfile + "." + raw dest = outfile + "." + tablename + ".tsv" if not os.path.exists(src): E.warn("no file {}, ignored".format(src)) continue with IOTools.open_file(src) as inf: data = inf.readlines() if tablename.endswith("metrics"): data = get_section("METRICS", data) elif tablename.endswith("histogram"): data = get_section("HISTOGRAM", data) with IOTools.open_file(dest, "w") as outf: outf.write("".join(data)) return retval
def run(self, outfile, params): prefix = IOTools.snip(outfile, ".vcf.gz") bam = resolve_argument(params.bam, sep=",") reference_fasta = get_reference(params) bam = " ".join(["--input_file {}".format(x) for x in bam.split(",")]) stmnts = [] if not os.path.exists(prefix + ".annotated.vcf.gz"): tmpfile, pre_statement, post_statement = self.pre_process( params.vcf, outfile, params) stmnts.append(pre_statement) stmnts.append( "java " "-Djava.io.tmpdir=%(tmpdir)s " "-jar {self.path} " "--analysis_type VariantAnnotator " "--variant {tmpfile} " "{bam} " "--reference_sequence {reference_fasta} " "--logging_level INFO " "--log_to_file {prefix}.VariantAnnotator.log " "--annotation FisherStrand " "--annotation StrandOddsRatio " "--annotation ReadPosRankSumTest " "--annotation RMSMappingQuality " "--annotation MappingQualityRankSumTest " "{params.options} " "--out {prefix}.annotated.vcf.gz " ">& {prefix}.VariantAnnotator.err".format(**locals())) stmnts.extend( self.build_calibration_workflow(outfile, prefix, prefix + ".annotated.vcf.gz", params)) stmnts.append(post_statement) else: E.warn("using pre-existing file {} with annotated variants".format( prefix + ".annotated.vcf.gz")) stmnts.extend( self.build_calibration_workflow(outfile, prefix, prefix + ".annotated.vcf.gz", params)) return self.run_statements(stmnts, job_memory="3G")
def ignore_task(self, infiles, outfiles, params):
    """return True if task should be ignored.

    This method will also create the output file(s).
    """
    if self._ignore:
        m = str(outfiles)
        for ignore in IOTools.val2list(self._ignore):
            if ignore in m:
                E.warn("task {} will be ignored".format(self.__name__))
                for f in IOTools.val2list(outfiles):
                    E.info("creating empty file {}".format(f))
                    IOTools.touch_file(f)
                return True
    return False
def run(self, infile, outfile, params): with open(outfile, "w") as outf: outf.write("chromosome\tsize\tmapped\tunmapped\n") try: retval = P.run("{params.path} idxstats " "{infile} " "2> {outfile}.log " ">> {outfile}; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) retval = None return retval
def run(self, infile, outfile, params): with open(outfile, "w") as outf: outf.write("counts\tcounts_fail\tcategory\n") try: retval = P.run("{params.path} flagstat " "{infile} " "2> {outfile}.log " "| perl -p -e 's/ \+ /\\t/; s/ /\\t/; s/\\(.*//' " ">> {outfile}; ".format(**locals())) except OSError as e: E.warn("input file {} gave the following errors: {}".format( infile, str(e))) return retval
def _test_task_will_run(self, taskf): # check if task is installed version = get_task_version(taskf) if version is None: self.skipTest("tools for task {} not available".format(taskf.name)) return # define input/output files metric_config = self.test_config["metric"] infiles = None for key, values in metric_config.items(): if key == taskf.name: infiles = values.get("files", None) elif "patterns" in values: for pattern in values["patterns"]: if re.search(pattern, taskf.name): infiles = values.get("files", None) break if infiles: break if infiles is None: self.skipTest("no input files specified for {}".format(taskf.name)) return tmpdir = tempfile.mkdtemp(dir=".", prefix="tmp_{}_".format(taskf.name)) outfile = os.path.join(tmpdir, "output.tsv") # instantiate task task = taskf() # set custom options for test if taskf.name in metric_config: for key, value in metric_config[taskf.name].items(): setattr(task, key, value) # run task task(infiles, outfile) # check if tool produced non-zero output self.assertTrue(os.path.exists(outfile)) self.assertGreater(os.path.getsize(outfile), 0) # cleanup try: shutil.rmtree(tmpdir) except OSError as ex: E.warn("could not remove {}: {}".format(tmpdir, ex))
def run(self, infiles, outfile, params): if not outfile.endswith("-pass.fastq.gz"): raise ValueError( "outfile must end in -pass.fastq.gz, got {}".format(outfile)) if params.min_size_bytes: before = len(infiles) infiles = [ x for x in infiles if os.path.getsize(x) >= params.min_size_bytes ] E.debug( "removing small files: after={}, before={}, removed={}".format( len(infiles), before, before - len(infiles))) if params.newer_than: before = len(infiles) cutoff = os.path.getmtime(params.newer_than) infiles = [x for x in infiles if os.path.getmtime(x) > cutoff] E.debug( "removing old files: after={}, before={}, removed={}".format( len(infiles), before, before - len(infiles))) if len(infiles) == 0: E.warn("no files left after filtering, creating empty file") IOTools.touch_file(outfile) return infiles = " ".join(infiles) outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz" statement = ("zcat {infiles} " "| daisy fastq2fastq " "--method=filter-ONT " "--min-average-quality={params.min_average_quality} " "--log={outfile}.log " "--min-length={params.min_length} " "--output-removed-fastq={outfile_fail} " "- " "| gzip " "> {outfile}".format(**locals())) return P.run(statement)
def __call__(self, dataframe, map_sample2label={}):
    df = dataframe.pivot(index="gc_bin", columns="sample",
                         values="mean").reset_index()

    # remove duplicate sample names
    # (in Cancer analysis: two blood samples)
    to_drop = [x for x in df.columns if x.startswith("2:")]
    df.drop(to_drop, axis=1, inplace=True)
    df.columns = [map_sample2label.get(x, x) for x in df.columns]

    if df.empty:
        E.warn("no data, no plot will be output")
        return

    ax = df.plot(kind="line", x="gc_bin")
    return ax
def get_sequence_length_dict(fastafn):
    """return a sequence/length dictionary from a fasta file.

    The fasta file needs to be indexed with samtools faidx.
    """
    # Temporary fix: see issue SYS-517
    if not os.path.exists(fastafn):
        E.warn("could not find file {}".format(fastafn))
        return None

    try:
        with pysam.FastaFile(fastafn) as inf:
            fastadict = dict(list(zip(inf.references, inf.lengths)))
    except IOError as ex:
        E.warn("file {} could not be opened: {}".format(fastafn, ex))
        fastadict = None

    return fastadict
def __call__(self, track, column_is_norm=None, *args, **kwargs):
    fn = track
    table = pandas.read_csv(
        fn,
        comment="#",
        sep="\t",
        dtype={"CHROM": object},
    ).set_index(["CHROM", "POS"])

    columns = table.columns
    if len(columns) > 2:
        E.warn("too many columns {}".format(columns))

    if column_is_norm:
        if len(columns) == 2:
            num, den = columns
            if num in column_is_norm:
                num, den = den, num
        else:
            den = [x for x in columns if x in column_is_norm][0]
            num = [x for x in columns if x not in column_is_norm][0]

        if den not in column_is_norm:
            raise ValueError(
                "denominator is {}, but not norm, cols = {}".format(
                    den, columns))

    table = table[(table[num] >= self.min_depth) &
                  (table[den] >= self.min_depth)]

    for column in columns:
        table[column] = table[column] / table[column].median()

    table["sample"] = num
    table["copy number"] = 2.0 * table[num] / table[den]
    table.drop(columns, axis=1, inplace=True)
    return table
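
# A toy run of the normalisation above (made-up depths): each column is
# median-normalised, then copy number is 2 * sample / normal, i.e. the
# normal sample is assumed diploid.
import pandas

toy = pandas.DataFrame({
    "CHROM": ["1", "1", "1"],
    "POS": [100, 200, 300],
    "tumour": [60.0, 30.0, 30.0],
    "normal": [30.0, 30.0, 30.0],
}).set_index(["CHROM", "POS"])
for column in ["tumour", "normal"]:
    toy[column] = toy[column] / toy[column].median()
toy["copy number"] = 2.0 * toy["tumour"] / toy["normal"]
print(toy["copy number"].tolist())   # [4.0, 2.0, 2.0]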
def __call__(self, infiles, outfile, only_info=False): # NOTE: extras not implemented in ruffus 2.6.3, thus # use parameter: only_info = "only_info" in P.PARAMS if self.mountpoint: # revert mount redirection for arvados to allow redirection # on individual cluster nodes for d, key, value in IOTools.nested_iter(infiles): d[key] = re.sub(self.mountpoint, "arv=", value) self.instantiate_input(infiles) self.save_meta(outfile, output_file=outfile) if only_info: E.warn("only_info - meta information has been updated") return params = self.build_params(output_file=outfile) benchmark = self.run(outfile, as_namedtuple(params)) self.save_benchmark(outfile, benchmark)
def __call__(self, infiles, outfile, only_info=False): # NOTE: extras not implemented in ruffus 2.6.3, thus # use parameter: only_info = "only_info" in P.PARAMS self.input_files = collect_file_meta_information({"in": infiles}) self.input_alias = self.build_alias(str(infiles), regex=self._input_regex, alias=self._input_alias) if isinstance(outfile, list): outdir = [os.path.dirname(x) for x in outfile] basefile = os.path.commonprefix(outfile) else: outdir = os.path.dirname(outfile) basefile = outfile kwargs = {'output_file': outfile, 'input_files': infiles, 'outdir': outdir} self.save_meta(outfile, **kwargs) if only_info: E.warn( "only_info - meta information in {} has been updated".format( os.path.join(os.path.dirname(basefile), "benchmark.info"))) return params = self.build_params() benchmark = self.run(infiles, outfile, as_namedtuple(params)) self.save_benchmark( basefile, benchmark)
def run(self, infile, outfile, params):
    options = []
    reference_fasta = params.reference_fasta
    reference_fasta_map = build_reference_fasta_map(
        params.reference_fasta_map)
    reference_label = None
    use_target_regions = True

    if params.reference_fasta:
        map_path2name = dict([(x[1], x[0])
                              for x in list(reference_fasta_map.items())])
        if params.reference_fasta == "auto":
            fasta = resolve_argument(list(reference_fasta_map.values()),
                                     ",").split(",")
            reference_fasta, diffs = get_reference_for_bam(
                infile, fastafiles=fasta)
            if reference_fasta:
                options.append("--ref-seq {}".format(reference_fasta))
                reference_label = map_path2name[reference_fasta]
            elif diffs:
                E.warn(
                    "attempted to detect reference fasta, but unable to "
                    "do so. diffs: {}".format(diffs))
            else:
                E.warn("sequence dict is empty, BAM likely to be empty. "
                       "target_regions will be ignored")
                use_target_regions = False
        else:
            options.append("--ref-seq {}".format(params.reference_fasta))
            reference_label = map_path2name.get(params.reference_fasta,
                                                None)

    if params.target_regions and use_target_regions:
        target_regions = get_associated_file(params, reference_label,
                                             "target_regions")
        # convert to 1-based coordinates and decompress
        if target_regions.endswith(".bed.gz"):
            target_regions = (
                "<(zcat {} "
                "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                .format(target_regions))
        options.append("--target-regions {}".format(target_regions))

    options = " ".join(options)

    if not os.path.exists(outfile + ".tmp"):
        try:
            retval = P.run("{params.path} stats "
                           "{self.options} "
                           "{options} "
                           "{infile} "
                           "2> {outfile}.log "
                           "> {outfile}.tmp; ".format(**locals()),
                           job_memory="16G")
        except OSError as e:
            E.warn("input file {} gave the following errors: {}".format(
                infile, str(e)))
            return None
    else:
        retval = None

    def split_output(lines):
        is_comment = True
        section, body = None, []
        for line in lines:
            if line.startswith("#"):
                if body:
                    yield section, body
                body = []
                is_comment = True
            else:
                # the following preserves new-line
                line = re.sub("\t#.*", "", line)
                fields = line[:-1].split("\t")
                section = fields[0]
                body.append(fields[1:])
                is_comment = False
        if body:
            yield section, body

    # split into separate files for upload
    with IOTools.open_file(outfile + ".tmp") as inf:
        for section, body in split_output(inf):
            try:
                tablename, columns = self._map_section_to_table[section]
            except KeyError:
                continue

            output_file = self.map_table_to_file(tablename, outfile)
            with IOTools.open_file(output_file, "w") as outf:
                if len(columns) > 1 and columns[1].startswith("VAR_"):
                    outf.write("{}\t{}\n".format(columns[0],
                                                 columns[1][4:]))
                    for data in body:
                        outf.write("{}\t{}\n".format(
                            data[0], ",".join(data)))
                else:
                    outf.write("\t".join(columns) + "\n")
                    # remove first column, which contains the identifier
                    outf.write("\n".join(
                        ["\t".join(x) for x in body]) + "\n")

    os.rename(outfile + ".tmp", outfile)
    return retval
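
# A toy run of the nested split_output() parser above on the layout
# "samtools stats" emits (helper assumed lifted to module scope for the
# demo): comment blocks start with '#', data rows are keyed by a section
# identifier in the first column.
stats_lines = [
    "# This file was produced by samtools stats\n",
    "SN\traw total sequences:\t2000\n",
    "SN\treads mapped:\t1998\t# excluding supplementary\n",
    "# next block\n",
    "FFQ\t1\t0\t3\n",
]
for section, body in split_output(stats_lines):
    print(section, body)
# SN [['raw total sequences:', '2000'], ['reads mapped:', '1998']]
# FFQ [['1', '0', '3']]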
def main(argv=None):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the "
        "default datadir in the configuration file and the environment "
        "variable DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories",
        action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount",
        action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp",
        action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "TaskLibrary", "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue

        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get(
                "DAISY_TEST_DATADIR",
                test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.load(re.sub("DATADIR", data_directory,
                                           raw_txt))

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(
                    test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
def run(self, outfile, params):
    min_job_memory = 3
    if "-t" in params.options:
        job_threads = int(
            re.search("-t\s*(\d+)", params.options).groups()[0])
    else:
        job_threads = 1

    job_memory = "{}G".format(
        float(min_job_memory + 1.0 * job_threads) / job_threads)

    cram_fasta = params.cram_fasta
    if params.cram_fasta is None:
        cram_fasta = params.reference_fasta

    if params.set_readgroup or params.readgroup_id_regex is not None:
        readgroup_string, readgroup_id, readgroup_sample = \
            build_readgroup_string(outfile, params)
        # pipes.quote is deprecated, use shlex.quote in py3
        readgroup_option = "-R {}".format(pipes.quote(readgroup_string))
        # add an additional level of quoting, replacing tabs with '\t':
        readgroup_option = re.sub("\\t", "\\\\t", readgroup_option)
    else:
        readgroup_option = ""

    fastq = " ".join(sra_peek(params.sra))
    outfile = os.path.abspath(outfile)

    if params.extract_to_temp:
        tmpdir = P.get_temp_filename(clear=True)
        tmpdir_pre = "mkdir {};".format(tmpdir)
        tmpdir_post = "rm -rf {}".format(tmpdir)
    else:
        tmpdir = os.path.dirname(outfile)
        tmpdir_pre = ""
        tmpdir_post = ""

    # AH: fastq-dump hangs with arv mounts, thus try copying first
    if not IOTools.is_local(params.sra):
        E.warn("copying file {} to temporary directory".format(params.sra))
        temp_sra = os.path.join(tmpdir, os.path.basename(params.sra))
        fastq_dump = (
            "cp {params.sra}* {tmpdir}; "
            "fastq-dump --split-files --gzip {temp_sra} "
            ">& {outfile}.dump.log ".format(**locals()))
        tmpdir_post = "rm -f {}*; {}".format(temp_sra, tmpdir_post)
    else:
        fastq_dump = (
            "fastq-dump --split-files --gzip {params.sra} "
            ">& {outfile}.dump.log ".format(**locals()))

    return P.run(
        "{tmpdir_pre} "
        "cd {tmpdir}; "
        "{fastq_dump}; "
        "{self.path} mem -v 3 "
        "{readgroup_option} "
        "{params.options} "
        "{params.reference_fasta} "
        "{fastq} "
        "2> {outfile}.map.log "
        "| samtools view -O cram --reference {cram_fasta} /dev/stdin "
        "2> {outfile}.view.log "
        "| samtools sort -T {tmpdir} -O cram /dev/stdin "
        "2> {outfile}.sort.log "
        "> {outfile}; "
        "samtools index {outfile} >& {outfile}.index.log; "
        "{tmpdir_post}".format(**locals()))
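
# Toy illustration of the readgroup quoting above: the @RG string is
# shell-quoted, then real tabs are replaced by a literal backslash-t so
# that bwa mem -R receives "\t" unmangled (shlex.quote is used here in
# place of the deprecated pipes.quote; the sample name is made up):
import re
import shlex

rg = "@RG\tID:sample1\tSM:sample1"
opt = "-R {}".format(shlex.quote(rg))
opt = re.sub("\\t", "\\\\t", opt)
print(opt)   # -R '@RG\tID:sample1\tSM:sample1'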
def main(argv=sys.argv):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option(
        "-l", "--link", dest="link", action="store_true",
        help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner,
                                                   config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):
        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)

        result_dir = os.path.basename(os.path.join(taskf.__name__ + ".dir"))
        new_data.append((result_dir, taskf))

        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(
            str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
def run(self, outfile, params):
    local_options = []

    outfile = os.path.abspath(outfile)
    outdir = os.path.dirname(outfile)

    # assumption is that the index directory is named like the reference,
    # i.e. xyz for xyz.fa, without the .fa/.fasta suffix
    reference_fasta = IOTools.snip(params.reference_fasta,
                                   ".fa", ".fasta")

    if not os.path.exists(reference_fasta):
        raise ValueError(
            "input reference {} does not exist".format(reference_fasta))

    if "--jobs" in params.options or "-j" in params.options:
        job_threads = int(
            re.search("(--jobs|-j)\s*(\d+)",
                      params.options).groups()[1])
    else:
        job_threads = 8

    if "--memory-limit" in params.options or "-m" in params.options:
        job_memory_gb = int(
            re.search("(--memory-limit|-m)\s*(\d+)",
                      params.options).groups()[1])
    else:
        job_memory_gb = 60
        local_options.append("--memory-limit {}".format(job_memory_gb))

    if job_memory_gb < 60:
        E.warn("isaac-align likely to require at least 60Gb of memory, "
               "{}G requested".format(job_memory_gb))

    job_memory = "{}G".format(float(job_memory_gb) / job_threads)

    fastq_dir = os.path.join(outdir, "input_fastq")
    if not os.path.exists(fastq_dir):
        os.makedirs(fastq_dir)

    if len(params.fastq) == 2:
        if not os.path.exists(
                os.path.join(fastq_dir, "lane1_read1.fastq.gz")):
            os.symlink(os.path.abspath(params.fastq[0]),
                       os.path.join(fastq_dir, "lane1_read1.fastq.gz"))
        if not os.path.exists(
                os.path.join(fastq_dir, "lane1_read2.fastq.gz")):
            os.symlink(os.path.abspath(params.fastq[1]),
                       os.path.join(fastq_dir, "lane1_read2.fastq.gz"))
    else:
        raise NotImplementedError(
            "expected 2 fastq files, got {}".format(len(params.fastq)))

    intermediate_bam = os.path.join(outdir, "Aligned", "Projects",
                                    "default", "default", "sorted.bam")

    # picard statement to set the readgroup
    picard_statement = self.build_picard_statement(
        intermediate_bam, outfile, params)

    tmpdir = os.path.join(outdir, "TEMP")

    local_options = " ".join(local_options)

    # isaac generates output files in the working directory, so cd there
    # and make sure that absolute path names are used elsewhere.
    statement = (
        "cd {outdir}; "
        "{self.path} "
        "--reference-genome {reference_fasta}/sorted-reference.xml "
        "--base-calls {fastq_dir} "
        "--base-calls-format fastq-gz "
        "--temp-directory {tmpdir} "
        "--cleanup-intermediary 1 "
        "--bam-gzip-level {params.bam_gzip_level} "
        "{params.options} "
        "{local_options} "
        ">& {outfile}.isaac.log; "
        "{picard_statement}; "
        "rm -rf {tmpdir} ".format(**locals()))

    return P.run(statement)
def __call__(self, infiles, outfile, only_info=False): # NOTE: extras not implemented in ruffus 2.6.3, thus # use parameter: only_info = "only_info" in P.PARAMS # ensure output directory exists. # This should be done on the pipeline level, but # ruffus currently seems not to allow this. outdir = os.path.dirname(outfile) if outdir and not os.path.exists(outdir): os.makedirs(outdir) output_files = [ self.map_table_to_file(x, outfile) for x in self.tablenames ] kwargs = { 'output_files': output_files, 'input_files': infiles, 'outdir': outdir } if self._runtime_regex: kwargs["alias"] = self.build_alias(str(infiles), regex=self._runtime_regex, alias=self._runtime_alias) self.save_meta(outfile, **kwargs) if self.ignore: found = False for i in self.ignore: if i in outdir: found = True break if found: E.warn("skipping task {} at runtime, an empty file is created". format(outfile)) IOTools.touch_file(outfile) return # if self.runtime_filter: # TODO: create empty outfile if regex matches # pass if only_info: E.warn( "only_info - meta information in {} has been updated".format( IOTools.snip(outfile) + ".info")) return # AH: duplicated from above? params = self.build_params(output_files=output_files) on_error_options = ["raise", "ignore"] on_error = params.get("on_error", "raise") if on_error not in on_error_options: raise ValueError("unknown option to 'on_error': '{}' " "should be one of '{}'".format( on_error, ",".join(on_error_options))) if self.ignore_task(infiles, outfile, params): return # deal with placeholder files created by identity that are # located on a remote mount point def map_to_mount(fn): if os.path.exists(fn + ".mnt"): if not P.PARAMS["mount_point"]: raise ValueError( "encountered mounted file {}, but no mount point present" .format(fn)) with open(fn + ".mnt") as inf: mount_path = inf.read() return os.path.join(P.PARAMS["mount_point"], mount_path) else: return fn # replace infiles with mount locations if necessary if isinstance(infiles, list): infiles = [map_to_mount(x) for x in infiles] else: infiles = map_to_mount(infiles) try: benchmark = self.run(infiles, outfile, as_namedtuple(params)) except Exception as ex: on_error = params.get("on_error", "raise") if on_error == "raise": raise elif on_error == "ignore": E.warn( "error occured during execution of {} but will be ignored:\n{}" .format(self.__name__, ex)) E.warn( "an empty output file {} will be created.".format(outfile)) IOTools.touch_file(outfile) benchmark = None if benchmark: self.save_benchmark(outfile, benchmark)
def run(self, infiles, outfile, params):
    phase1_statements = []
    outdir = os.path.dirname(outfile)
    retvals = []

    vcf_files = []
    for fn in infiles:
        prefix = re.search(params.regex_filename, fn).groups()[0]
        out_fn = os.path.join(outdir, "file_{}".format(prefix))
        vcf_files.append(out_fn)
        if os.path.exists(out_fn + ".bcf"):
            continue
        phase1_statements.append("{self.path} ingest1 "
                                 "--output {out_fn} "
                                 "--fasta-ref {params.reference_fasta} "
                                 "{fn} "
                                 ">& {out_fn}.log; ".format(**locals()))

    phase2_statements = []
    block_files = []
    for start in range(0, len(vcf_files), self.block_size):
        out_fn = os.path.join(outdir, "block_{}".format(start))
        block_files.append(out_fn)
        if os.path.exists(out_fn + ".bcf"):
            continue
        end = start + self.block_size
        files = " ".join(
            ["{}.bcf".format(x) for x in vcf_files[start:end]])
        phase2_statements.append("{self.path} ingest2 "
                                 "--output {out_fn} "
                                 "{files} "
                                 ">& {out_fn}.log; ".format(**locals()))

    if phase2_statements:
        if phase1_statements:
            retvals.extend(P.run(phase1_statements, job_memory="4G"))
        else:
            E.warn("all files complete for phase 1")
        retvals.extend(P.run(phase2_statements, job_memory="4G"))
    else:
        E.warn("all files complete for phase 2")

    with pysam.VariantFile(block_files[0] + ".bcf") as bcf_file:
        contigs = list(bcf_file.header.contigs)

    files = " ".join(["{}.bcf".format(x) for x in block_files])

    phase3_statements = []
    chromosome_files = []
    for contig in contigs:
        out_fn = os.path.join(outdir, "chr_{}.bcf".format(contig))
        chromosome_files.append(out_fn)
        if os.path.exists(out_fn):
            continue
        phase3_statements.append(
            "{self.path} genotype "
            "--thread 4 "
            "--output-file {out_fn} "
            "--output-type b "
            "-r {contig} "
            "{files} "
            ">& {out_fn}.log; "
            "bcftools index {out_fn}".format(**locals()))

    retvals.extend(P.run(phase3_statements, job_memory="4G",
                         job_threads=4))

    if phase3_statements or not os.path.exists(outfile):
        files = " ".join(chromosome_files)
        retvals.extend(
            P.run("bcftools concat "
                  "-o {outfile} "
                  "-O z "
                  "{files} "
                  ">& {outfile}_concat.log; "
                  "tabix -p vcf {outfile}".format(**locals())))

    return retvals
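
# How the phase-2 blocking above partitions the per-file BCFs into
# fixed-size groups (a block_size of 3 is assumed for the example):
vcfs = ["file_{}".format(i) for i in range(7)]
block_size = 3
blocks = [vcfs[start:start + block_size]
          for start in range(0, len(vcfs), block_size)]
print(blocks)
# [['file_0', 'file_1', 'file_2'], ['file_3', 'file_4', 'file_5'], ['file_6']]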