def run(self, outfile, params):
    """Call variants with GATK HaplotypeCaller and queue the calibration steps.

    The raw calls are written to ``<prefix>.raw.vcf.gz`` where ``<prefix>``
    is `outfile` with ``.vcf.gz`` snipped off. If that file is already
    present, no HaplotypeCaller statement is built and a warning is
    emitted instead. The statements returned by
    ``self.build_calibration_workflow`` are appended in either case.

    Arguments
    ---------
    outfile : string
        Name of the final output file.
    params : object
        Tool parameters. ``params.bam`` and ``params.haplotypecaller``
        are read here.

    Returns
    -------
    The return value of ``self.run_statements``.
    """
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)
    prefix = IOTools.snip(outfile, ".vcf.gz")
    vcf_output = prefix + ".raw.vcf.gz"

    stmnts = []
    if os.path.exists(vcf_output):
        E.warn("output file {vcf_output} already exists - "
               "it will not be recomputed".format(vcf_output=vcf_output))
    else:
        # "%(tmpdir)s" is deliberately left untouched by str.format
        haplotypecaller = (
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--logging_level INFO "
            "--log_to_file {outfile}.HaplotypeCaller.log "
            "{params.haplotypecaller} "
            "--out {vcf_output} "
            ">& {prefix}.HaplotypeCaller.err")
        stmnts.append(haplotypecaller.format(
            self=self,
            bam=bam,
            reference_fasta=reference_fasta,
            outfile=outfile,
            params=params,
            vcf_output=vcf_output,
            prefix=prefix))

    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, vcf_output, params))
    return self.run_statements(stmnts, job_memory="5G")
def run(self, outfile, params):
    """Convert delly calls into a bgzipped, tabix-indexed BED-like file.

    The VCF is expected at `outfile` with ``.bed.gz`` replaced by
    ``.vcf.gz``. If it does not exist, ``run_tool_delly.run`` is called
    first to create it. The VCF is then queried with bcftools for
    CHROM, POS, END and SVTYPE. The awk filter skips records whose third
    column is ".", sets a fifth column to 0 for DEL and 3 for DUP, and
    drops INS records.

    Arguments
    ---------
    outfile : string
        Name of the output file.
    params : object
        Tool parameters. ``params.bcftools_options`` is read here.

    Returns
    -------
    list
        Return values of the delly run (if it was needed) followed by
        the return value of ``P.run`` for the conversion.
    """
    vcffile = IOTools.snip(outfile, ".bed.gz") + ".vcf.gz"

    retvals = []
    if not os.path.exists(vcffile):
        retvals.extend(run_tool_delly.run(self, vcffile, params))

    convert = (
        "{self.path_bcftools} query "
        "{params.bcftools_options} "
        "-f \"%%CHROM\\t%%POS\\t%%END\\t%%SVTYPE\\n\" "
        "{vcffile} "
        "| awk -v OFS='\\t' '$3 != \".\" {{ switch ($4) {{"
        "case \"DEL\": $5=0; break; "
        "case \"DUP\": $5=3; break; "
        "case \"INS\": next; break; "
        "}}; print }}' "
        "| bgzip "
        "> {outfile}").format(
            self=self, params=params, vcffile=vcffile, outfile=outfile)
    index = "tabix -f -p bed {outfile}".format(outfile=outfile)

    statement = "; ".join((convert, index))
    retvals.append(P.run(statement))
    return retvals
def _join_regex_groups(pattern, text):
    """Return the groups captured by `pattern` in `text`, joined by '-'.

    Raises AttributeError (the exception type callers already see) when
    `pattern` does not match `text`.
    """
    match = re.search(pattern, text)
    if match is None:
        raise AttributeError(
            "regular expression {} does not match {}".format(pattern, text))
    return "-".join(match.groups())


def build_readgroup_string(outfile, params):
    """Build a SAM ``@RG`` header line for `outfile`.

    Arguments
    ---------
    outfile : string
        Name of the BAM file the read group is built for.
    params : object
        Tool parameters. The following attributes are read:

        ``readgroup_id_regex``
            If None, the read group id is the basename of `outfile`
            with ``.bam`` snipped off. Otherwise the groups captured by
            this regular expression in `outfile` are joined with ``-``.
        ``readgroup_sample_regex``
            If None, the sample equals the read group id. Otherwise it
            is derived from `outfile` like the id.
        ``readgroup_header``
            If true, appended to the read group string after a tab.

    Returns
    -------
    tuple
        ``(readgroup_string, readgroup_id, readgroup_sample)``.

    Raises
    ------
    AttributeError
        If one of the regular expressions does not match `outfile`.
    """
    if params.readgroup_id_regex is None:
        readgroup_id = IOTools.snip(os.path.basename(outfile), ".bam")
    else:
        readgroup_id = _join_regex_groups(
            params.readgroup_id_regex, outfile)

    if params.readgroup_sample_regex is None:
        readgroup_sample = readgroup_id
    else:
        readgroup_sample = _join_regex_groups(
            params.readgroup_sample_regex, outfile)

    readgroup_string = "@RG\tID:{}\tSM:{}".format(
        readgroup_id, readgroup_sample)
    if params.readgroup_header:
        readgroup_string += "\t{}".format(params.readgroup_header)

    return readgroup_string, readgroup_id, readgroup_sample
def main(argv=None):
    """Count how many variant positions are shared between sets of VCF files.

    Each input file is read into a data frame of positions and gets an
    indicator column named after the file. The frames are outer-merged,
    missing indicators are filled with 0, and the number of positions for
    every combination of indicator values is written as a tab-separated
    table to ``options.stdout``.

    Arguments
    ---------
    argv : list
        Command line arguments, passed on to ``E.start``.

    Raises
    ------
    ValueError
        If fewer than two input files are given, or if
        ``--regex-filename`` does not match a filename.
    """
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename", dest="regex_filename", type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option(
        "--filter", dest="filters", type="choice", action="append",
        choices=("PASS", "SNP"),
        help="apply filters to VCFs when reading "
        "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in args:
        if options.regex_filename:
            # test the match explicitly instead of waiting for an
            # AttributeError on None
            match = re.search(options.regex_filename, filename)
            if match is None:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, filename))
            name = match.groups()[0]
        else:
            name = iotools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=options.filters)
        # indicator column: 1 where the position is present in this file
        df[name] = 1
        dfs.append(df)

    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    # positions missing from a file have no indicator after the outer merge
    merged_df = merged_df.fillna(0)

    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    # the unnamed size column created by reset_index becomes "counts"
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)
    E.stop()
def fastqscreen_filename2track(fn):
    """Derive a track name from a fastq_screen result filename.

    The directory part of `fn` is dropped, ``_screen.txt`` is snipped
    off the end and every match of the pattern ``.fastq.`` is replaced
    by ``-``.

    NOTE(review): the dots in the pattern are regular expression
    wildcards, so e.g. ``_fastq_`` is replaced as well. Confirm whether
    this is intended before escaping them.
    """
    basename = os.path.basename(fn)
    stem = iotools.snip(basename, "_screen.txt")
    return re.sub(".fastq.", "-", stem)
def submit_function(*args, **kwargs):
    """Call ``func`` directly, or hand it over to ``submit``.

    If the keyword argument ``submit`` is present and true, the
    arguments are pickled via ``_pickle_args`` and ``submit`` is called
    with the ``run_pickled`` entry point; nothing is returned in that
    case. Otherwise the job options ``submit``, ``job_options`` and
    ``job_queue`` are removed from the keyword arguments and the result
    of ``func(*args, **kwargs)`` is returned.
    """
    if not kwargs.get("submit"):
        # remove job control options before running function
        for option in ("submit", "job_options", "job_queue"):
            kwargs.pop(option, None)
        return func(*args, **kwargs)

    del kwargs["submit"]
    submit_args, args_file = _pickle_args(args, kwargs)
    module_file = os.path.abspath(sys.modules[func.__module__].__file__)
    submit(
        iotools.snip(__file__),
        "run_pickled",
        args=[iotools.snip(module_file), function_name, args_file],
        **submit_args)
def run(self, infile, outfile, params):
    """Filter the reads of one fastq file with ``daisy fastq2fastq``.

    The statistics table is written to `outfile`. Reads passing the
    filter go to ``<prefix>-pass.fastq.gz`` and removed reads to
    ``<prefix>-fail.fastq.gz``, where ``<prefix>`` is `outfile` with
    ``.tsv`` snipped off.

    Arguments
    ---------
    infile : string
        Name of the input file, read through ``zcat``.
    outfile : string
        Name of the statistics table.
    params : object
        Tool parameters. ``min_average_quality`` and ``min_length``
        are read here.

    Returns
    -------
    The return value of ``P.run``.
    """
    prefix = IOTools.snip(outfile, ".tsv")
    outfile_pass = prefix + "-pass.fastq.gz"
    outfile_fail = prefix + "-fail.fastq.gz"

    template = (
        "zcat {infile} "
        "| daisy fastq2fastq "
        "--method=filter-ONT "
        "--min-average-quality={params.min_average_quality} "
        "--log={outfile}.log "
        "--min-length={params.min_length} "
        "--output-removed-fastq={outfile_fail} "
        "--output-stats-tsv={outfile} "
        "- "
        "| gzip "
        "> {outfile_pass} ")
    statement = template.format(
        infile=infile,
        params=params,
        outfile=outfile,
        outfile_fail=outfile_fail,
        outfile_pass=outfile_pass)

    return P.run(statement)
def run(self, outfile, params):
    """Make `outfile` stand in for the single input file of the tool.

    For a mounted source (as decided by ``file_is_mounted``) an empty
    placeholder with the source's access and modification times is
    created, together with a ``.mnt`` side-car file holding the result
    of ``get_mounted_location``. Otherwise a symbolic link is created.
    Existing targets are left alone.

    If ``self.add_glob`` is set, every file matching the source name
    plus that glob is linked the same way. The part of its basename
    after the source basename is appended to `outfile`. If
    ``self.chop_suffix`` is set, it is snipped off both the source and
    `outfile` before globbing.

    Raises
    ------
    ValueError
        If ``self.file`` is None.
    NotImplementedError
        If ``self.file`` is a list that does not contain exactly one item.
    """
    if self.file is None:
        raise ValueError("tool 'identity' requires a 'file'")

    fn = self.file
    if isinstance(fn, list):
        if len(fn) != 1:
            raise NotImplementedError(
                "tool 'identity' called with multiple files: {}".format(
                    fn))
        fn = fn[0]

    source_fn = os.path.abspath(fn)

    def touch_and_mark_as_mounted(source, dest):
        source_stat = os.stat(source)
        IOTools.touch_file(
            dest, times=(source_stat.st_atime, source_stat.st_mtime))
        with open(dest + ".mnt", "w") as outf:
            outf.write(get_mounted_location(source))

    link_f = (touch_and_mark_as_mounted
              if file_is_mounted(source_fn)
              else os.symlink)

    if not os.path.exists(outfile):
        link_f(source_fn, outfile)

    if not self.add_glob:
        return

    if self.chop_suffix:
        source_fn = IOTools.snip(source_fn, self.chop_suffix)
        outfile = IOTools.snip(outfile, self.chop_suffix)

    # number of leading characters each globbed basename shares with
    # the source basename
    prefix = len(os.path.basename(source_fn))
    for extra_fn in glob.glob(source_fn + self.add_glob):
        target = outfile + os.path.basename(extra_fn)[prefix:]
        if not os.path.exists(target):
            link_f(os.path.abspath(extra_fn), target)
def run(self, outfile, params):
    """Run the GATK GVCF workflow: per-BAM HaplotypeCaller, then GenotypeGVCFs.

    One ``<prefix>.<idx>.g.vcf`` file is created per input BAM, where
    ``<prefix>`` is `outfile` with ``.vcf.gz`` snipped off; existing
    GVCF files are not recomputed. All GVCF files are then genotyped
    jointly into ``<prefix>.raw.vcf.gz``, followed by the statements
    returned by ``self.build_calibration_workflow``.

    Arguments
    ---------
    outfile : string
        Name of the final output file.
    params : object
        Tool parameters. ``bam``, ``haplotypecaller`` and
        ``genotypegvcfs`` are read here.

    Returns
    -------
    The return value of ``self.run_statements`` for the genotyping and
    calibration statements.
    """
    prefix = IOTools.snip(outfile, ".vcf.gz")
    bams = resolve_argument(params.bam, ",")
    reference_fasta = get_reference(params)

    statements, gvcf_files = [], []
    # TODO: sort out multi-threading
    for idx, bam in enumerate(bams.split(",")):
        output = prefix + "." + str(idx) + ".g.vcf"
        gvcf_files.append(output)

        if os.path.exists(output):
            E.info("{} already exists - skipped".format(output))
            continue

        statements.append(
            "java "
            "-Djava.io.tmpdir=%(tmpdir)s "
            "-jar {self.path} "
            "--analysis_type HaplotypeCaller "
            "--input_file {bam} "
            "--reference_sequence {reference_fasta} "
            "--emitRefConfidence GVCF "
            "--logging_level INFO "
            "--log_to_file {prefix}.HaplotypeCaller.{idx}.log "
            "{params.haplotypecaller} "
            "--out {output} "
            ">& {prefix}.HaplotypeCaller.{idx}.err".format(
                self=self,
                bam=bam,
                reference_fasta=reference_fasta,
                prefix=prefix,
                idx=idx,
                params=params,
                output=output))

    # the per-sample calls are run first and separately
    if statements:
        self.run_statements(statements, job_memory="4G")

    variant_options = " ".join(
        ["--variant {}".format(x) for x in gvcf_files])
    vcf_output = prefix + ".raw.vcf.gz"

    stmnts = []
    # stderr goes to "<prefix>.GenotypeGVCFs.err", matching the ".err"
    # suffix of the HaplotypeCaller redirect above; previously the
    # suffix was missing here.
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type GenotypeGVCFs "
        "--reference_sequence {reference_fasta} "
        "{gvcfs} "
        "--logging_level INFO "
        "--log_to_file {prefix}.GenotypeGVCFs.log "
        "{params.genotypegvcfs} "
        "--out {vcf_output} "
        ">& {prefix}.GenotypeGVCFs.err".format(
            self=self,
            reference_fasta=reference_fasta,
            gvcfs=variant_options,
            prefix=prefix,
            params=params,
            vcf_output=vcf_output))

    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, vcf_output, params))

    return self.run_statements(stmnts, job_memory="4G")
def summarizeFastqScreen(infiles, outfiles):
    """Collect fastq_screen results into a summary and a details table.

    For every file in `infiles` the suffix ``screen`` is snipped off and
    all files matching ``<remainder>*_screen.txt`` are collected. If
    nothing is found, a warning is emitted and each file in `outfiles`
    is touched. Otherwise the tables returned by
    ``readqc.read_fastq_screen`` are written tab-separated to
    ``outfiles[0]`` (summary) and ``outfiles[1]`` (details).
    """
    all_files = [
        filename
        for infile in infiles
        for filename in glob.glob(
            iotools.snip(infile, "screen") + "*_screen.txt")
    ]

    if not all_files:
        E.warn("no fastqcscreen results to concatenate")
        for placeholder in outfiles:
            iotools.touch_file(placeholder)
        return

    df_summary, df_details = readqc.read_fastq_screen(all_files)
    summary_file, details_file = outfiles[0], outfiles[1]
    df_summary.to_csv(summary_file, sep="\t", index=True)
    df_details.to_csv(details_file, sep="\t", index=True)
def run(self, infiles, outfile, params):
    """Filter the reads of several fastq files with ``daisy fastq2fastq``.

    Input files can be dropped beforehand by size
    (``params.min_size_bytes``) and by modification time (strictly
    newer than the file named by ``params.newer_than``). If no input
    file is left, a warning is emitted, `outfile` is touched and nothing
    is run. Removed reads are written to `outfile` with
    ``-pass.fastq.gz`` replaced by ``-fail.fastq.gz``.

    Returns
    -------
    The return value of ``P.run``, or None if no input file was left.

    Raises
    ------
    ValueError
        If `outfile` does not end in ``-pass.fastq.gz``.
    """
    if not outfile.endswith("-pass.fastq.gz"):
        raise ValueError(
            "outfile must end in -pass.fastq.gz, got {}".format(outfile))

    if params.min_size_bytes:
        n_before = len(infiles)
        infiles = [
            x for x in infiles
            if os.path.getsize(x) >= params.min_size_bytes
        ]
        n_after = len(infiles)
        E.debug(
            "removing small files: after={}, before={}, removed={}".format(
                n_after, n_before, n_before - n_after))

    if params.newer_than:
        n_before = len(infiles)
        cutoff = os.path.getmtime(params.newer_than)
        infiles = [x for x in infiles if os.path.getmtime(x) > cutoff]
        n_after = len(infiles)
        E.debug(
            "removing old files: after={}, before={}, removed={}".format(
                n_after, n_before, n_before - n_after))

    if not infiles:
        E.warn("no files left after filtering, creating empty file")
        IOTools.touch_file(outfile)
        return

    infiles = " ".join(infiles)
    outfile_fail = IOTools.snip(outfile, "-pass.fastq.gz") + "-fail.fastq.gz"

    template = (
        "zcat {infiles} "
        "| daisy fastq2fastq "
        "--method=filter-ONT "
        "--min-average-quality={params.min_average_quality} "
        "--log={outfile}.log "
        "--min-length={params.min_length} "
        "--output-removed-fastq={outfile_fail} "
        "- "
        "| gzip "
        "> {outfile}")
    statement = template.format(
        infiles=infiles,
        params=params,
        outfile=outfile,
        outfile_fail=outfile_fail)

    return P.run(statement)
def run(self, outfile, params):
    """Annotate variants with GATK VariantAnnotator, then calibrate.

    The annotated variants are written to ``<prefix>.annotated.vcf.gz``
    where ``<prefix>`` is `outfile` with ``.vcf.gz`` snipped off. If
    that file already exists, only the statements returned by
    ``self.build_calibration_workflow`` are run. Otherwise the
    statements from ``self.pre_process`` wrap the annotation and the
    calibration statements.

    Arguments
    ---------
    outfile : string
        Name of the final output file.
    params : object
        Tool parameters. ``bam``, ``vcf`` and ``options`` are read here.

    Returns
    -------
    The return value of ``self.run_statements``.
    """
    prefix = IOTools.snip(outfile, ".vcf.gz")
    bam = resolve_argument(params.bam, sep=",")
    reference_fasta = get_reference(params)

    # one --input_file option per BAM file
    bam_options = " ".join(
        ["--input_file {}".format(x) for x in bam.split(",")])
    annotated_vcf = prefix + ".annotated.vcf.gz"

    if os.path.exists(annotated_vcf):
        E.warn("using pre-existing file {} with annotated variants".format(
            annotated_vcf))
        stmnts = []
        stmnts.extend(self.build_calibration_workflow(
            outfile, prefix, annotated_vcf, params))
        return self.run_statements(stmnts, job_memory="3G")

    tmpfile, pre_statement, post_statement = self.pre_process(
        params.vcf, outfile, params)

    annotate = (
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type VariantAnnotator "
        "--variant {tmpfile} "
        "{bam} "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "--log_to_file {prefix}.VariantAnnotator.log "
        "--annotation FisherStrand "
        "--annotation StrandOddsRatio "
        "--annotation ReadPosRankSumTest "
        "--annotation RMSMappingQuality "
        "--annotation MappingQualityRankSumTest "
        "{params.options} "
        "--out {prefix}.annotated.vcf.gz "
        ">& {prefix}.VariantAnnotator.err").format(
            self=self,
            tmpfile=tmpfile,
            bam=bam_options,
            reference_fasta=reference_fasta,
            prefix=prefix,
            params=params)

    stmnts = [pre_statement, annotate]
    stmnts.extend(self.build_calibration_workflow(
        outfile, prefix, annotated_vcf, params))
    stmnts.append(post_statement)

    return self.run_statements(stmnts, job_memory="3G")
def run(self, outfile, params):
    """Run the tool at ``params.path`` on the BAM files, then sort and index.

    The tool writes to `outfile` with ``.gz`` snipped off. Its output is
    then sorted with ``vcf-sort``, compressed with ``bgzip`` to the
    ``.gz`` name and indexed with ``tabix``.

    Arguments
    ---------
    outfile : string
        Name of the compressed output file.
    params : object
        Tool parameters. ``bam``, ``path`` and ``options`` are read here.

    Returns
    -------
    The return value of ``P.run``.
    """
    bam = resolve_argument(params.bam, sep=",")

    # previously used temporary directory option:
    # "-T {outfile}.tmpdir -k "
    outfile = IOTools.snip(outfile, ".gz")

    # note that lumpy removes the temporary directory
    # after running, thus make sure it is unique and exists
    template = (
        "{params.path} "
        "-B {bam} "
        "-o {outfile} "
        "-T %(tmpdir)s_{self.__name__} "
        "-v "
        "{params.options} "
        ">& {outfile}.log; "
        "vcf-sort {outfile} "
        "| bgzip > {outfile}.gz; "
        "tabix -p vcf {outfile}.gz")

    return P.run(template.format(
        params=params, bam=bam, outfile=outfile, self=self))
def run(self, outfile, params):
    """Mark duplicates with Picard MarkDuplicates.

    Picard is asked to create an index (``CREATE_INDEX=TRUE``). The
    final ``mv`` renames ``<outprefix>.bai`` to ``<outfile>.bai``, where
    ``<outprefix>`` is `outfile` with ``.bam`` or ``.cram`` snipped off.

    Arguments
    ---------
    outfile : string
        Name of the output file.
    params : object
        Tool parameters. ``bam``, ``path``, ``reference_fasta`` and
        ``options`` are read here.

    Returns
    -------
    The return value of ``P.run``.
    """
    bam = resolve_argument(params.bam)

    # rename index from x.bai to x.bam.bai
    outprefix = IOTools.snip(outfile, ".bam", ".cram")

    template = (
        "java -Xmx8000m -jar {params.path} "
        "MarkDuplicates "
        "INPUT={bam} "
        "TMP_DIR=%(tmpdir)s "
        "CREATE_INDEX=TRUE "
        "REFERENCE_SEQUENCE={params.reference_fasta} "
        "METRICS_FILE={outfile}.metrics "
        "{params.options} "
        "OUTPUT={outfile} "
        ">& {outfile}.log; "
        "mv {outprefix}.bai {outfile}.bai")
    statement = template.format(
        params=params, bam=bam, outfile=outfile, outprefix=outprefix)

    # 12G is required for java overhead
    return P.run(statement, job_memory="12G")
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''collect fastqc status results from multiple runs into a single table.

    For every input file, the ``fastqc_data.txt`` files found under
    ``<datadir>/<base_track>*_fastqc/`` are parsed, where ``<base_track>``
    is the basename of the input file with ``.fastqc`` snipped off. The
    status of each section is recorded. The output table has one row per
    parsed file and one column per section name; sections missing from
    a file are reported as an empty string.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information).
        The track name is derived from that.
    outfile : string
        Output filename in :term:`tsv` format.
    datadir : string
        Location of actual Fastqc output to be parsed.
    '''
    names = set()
    results = []

    # the output file is opened first, as before, but is now closed
    # even if parsing fails
    with iotools.open_file(outfile, "w") as outf:
        for infile in infiles:
            base_track = iotools.snip(os.path.basename(infile), ".fastqc")
            filename = os.path.join(datadir,
                                    base_track + "*_fastqc",
                                    "fastqc_data.txt")

            # there can be missing sections
            for fn in glob.glob(filename):
                # defaultdict: sections absent from this file yield ""
                # when the table is written
                stats = collections.defaultdict(str)
                with iotools.open_file(fn) as inf:
                    for name, status, header, data in \
                            FastqcSectionIterator(inf):
                        stats[name] = status

                track = fastqc_filename2track(fn)
                results.append((track, fn, stats))
                names.update(stats)

        names = sorted(names)
        outf.write("track\tfilename\t%s\n" % "\t".join(names))
        for track, fn, stats in results:
            outf.write("%s\t%s\t%s\n" %
                       (track, os.path.dirname(fn),
                        "\t".join(stats[x] for x in names)))
def main(argv=None):
    """Dispatch to the ``main`` function of one of the daisy tool modules.

    All ``*.py`` files next to this file are imported as
    ``daisy.tools.<name>``, except those listed in ``IGNORE`` and those
    with ``flycheck`` in their path. Every module that has a ``main``
    attribute is registered under its module name with underscores
    replaced by dashes, and under the synonyms listed for that name in
    ``SYNONYMS``.

    Without a command the sorted list of commands is printed. Otherwise
    the command named by ``argv[1]`` is called with ``argv[1:]`` and its
    return value is returned.
    """
    if argv is None:
        argv = sys.argv

    tool_dir = os.path.dirname(__file__)
    modules = []
    for module in glob.glob(os.path.join(tool_dir, "*.py")):
        basename = os.path.basename(module)
        if basename in IGNORE or "flycheck" in module:
            continue
        mod = "daisy.tools.{}".format(snip(basename))
        modules.append(importlib.import_module(mod))

    script_dict = {}
    for module in modules:
        members = dict(inspect.getmembers(module))
        if "main" not in members:
            continue
        entry_point = members["main"]

        name = module.__name__.split(".")[-1].replace("_", "-")
        script_dict[name] = entry_point
        for synonym in SYNONYMS.get(name, []):
            script_dict[synonym] = entry_point

    if len(argv) == 1:
        print('\n'.join(sorted(script_dict)))
        return None

    command_key = argv[1]
    command_args = argv[1:]
    command = script_dict[command_key]
    try:
        return command(command_args)
    except BaseException:
        # announce the failing command, then let the error propagate
        print('When running {!r}'.format(command_key))
        raise
def run(self, outfile, params):
    """Align a pair of fastq files with isaac-align and set the read group.

    The two fastq files are linked into ``<outdir>/input_fastq`` as
    ``lane1_read1.fastq.gz`` and ``lane1_read2.fastq.gz``. The aligner
    is run inside the output directory, the statement built by
    ``self.build_picard_statement`` is applied to isaac's
    ``sorted.bam``, and the temporary directory is removed.

    The number of threads and the memory limit are parsed from
    ``params.options`` (``--jobs``/``-j`` and ``--memory-limit``/``-m``,
    value separated by whitespace or ``=``). If an option is missing,
    the defaults of 8 threads and 60 Gb are used, and
    ``--memory-limit 60`` is added to the command line.

    Arguments
    ---------
    outfile : string
        Name of the output file.
    params : object
        Tool parameters. ``reference_fasta``, ``options``, ``fastq``
        and ``bam_gzip_level`` are read here.

    Returns
    -------
    The return value of ``P.run``.

    Raises
    ------
    ValueError
        If the reference (``params.reference_fasta`` without ``.fa`` or
        ``.fasta``) does not exist.
    NotImplementedError
        If ``params.fastq`` does not contain exactly two files.
    """
    local_options = []
    outfile = os.path.abspath(outfile)
    outdir = os.path.dirname(outfile)

    # assumption is that index is called xyz.fa without the .fa.
    reference_fasta = IOTools.snip(params.reference_fasta, ".fa", ".fasta")
    if not os.path.exists(reference_fasta):
        raise ValueError(
            "input reference {} does not exist".format(reference_fasta))

    # Search directly instead of testing for a substring first: a
    # substring such as "-m" also occurs inside unrelated options, in
    # which case the regular expression did not match and the lookup
    # crashed on None. Raw strings avoid the invalid escape sequences
    # of the previous plain literals.
    jobs_match = re.search(r"(?:--jobs|-j)[\s=]*(\d+)", params.options)
    # NOTE(review): job_threads and job_memory are not referenced again
    # in this function; presumably P.run picks them up from this frame,
    # so the names are kept - confirm.
    job_threads = int(jobs_match.group(1)) if jobs_match else 8

    memory_match = re.search(
        r"(?:--memory-limit|-m)[\s=]*(\d+)", params.options)
    if memory_match:
        job_memory_gb = int(memory_match.group(1))
    else:
        job_memory_gb = 60
        local_options.append("--memory-limit {}".format(job_memory_gb))

    if job_memory_gb < 60:
        E.warn("isaac-align likely to require at least 60Gb of memory, "
               "{}G requested".format(job_memory_gb))

    # the total memory is spread over the threads
    job_memory = "{}G".format(float(job_memory_gb) / job_threads)

    fastq_dir = os.path.join(outdir, "input_fastq")
    if not os.path.exists(fastq_dir):
        os.makedirs(fastq_dir)

    if len(params.fastq) != 2:
        raise NotImplementedError(
            "expected 2 fastq files, received only {}".format(
                len(params.fastq)))

    for read_number, fastq in enumerate(params.fastq, start=1):
        link_name = os.path.join(
            fastq_dir, "lane1_read{}.fastq.gz".format(read_number))
        if not os.path.exists(link_name):
            os.symlink(os.path.abspath(fastq), link_name)

    intermediate_bam = os.path.join(
        outdir, "Aligned", "Projects", "default", "default", "sorted.bam")

    # picard statement to set readgroup
    picard_statement = self.build_picard_statement(
        intermediate_bam, outfile, params)

    tmpdir = os.path.join(outdir, "TEMP")
    local_options = " ".join(local_options)

    # isaac generates output files in working directory, so do a cd and make
    # sure that absolute path names are used elsewhere.
    statement = (
        "cd {outdir}; "
        "{self.path} "
        "--reference-genome {reference_fasta}/sorted-reference.xml "
        "--base-calls {fastq_dir} "
        "--base-calls-format fastq-gz "
        "--temp-directory {tmpdir} "
        "--cleanup-intermediary 1 "
        "--bam-gzip-level {params.bam_gzip_level} "
        "{params.options} "
        "{local_options} "
        ">& {outfile}.isaac.log; "
        "{picard_statement}; "
        "rm -rf {tmpdir} "
        .format(
            outdir=outdir,
            self=self,
            reference_fasta=reference_fasta,
            fastq_dir=fastq_dir,
            tmpdir=tmpdir,
            params=params,
            local_options=local_options,
            outfile=outfile,
            picard_statement=picard_statement))

    return P.run(statement)
# Directories searched for task modules: the directory of this file
# first, followed by the comma-separated entries of the environment
# variable DAISY_TASKLIBRARY (empty entries are ignored).
module_dirs = [os.path.dirname(__file__)]
module_dirs.extend(
    entry.strip()
    for entry in os.environ.get("DAISY_TASKLIBRARY", "").split(",")
    if entry.strip())

modules = []
for idx, root in enumerate(module_dirs):
    for module in glob.glob(os.path.join(root, "*.py")):
        if "flycheck" in module or module.endswith("__init__.py"):
            continue

        module_name = IOTools.snip(os.path.basename(module))
        if idx != 0:
            # modules from user directories are loaded straight from
            # their file
            spec = importlib.util.spec_from_file_location(
                "daisy.UserLibrary.{}".format(module_name), module)
            user_module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(user_module)
            modules.append(user_module)
        else:
            # modules next to this file are part of the daisy.tasks package
            modules.append(
                importlib.import_module(
                    "daisy.tasks.{}".format(module_name)))

# TODO: use derivation instead of name prefix
map_tool_to_runner = {}
map_metric_to_runner = {}
map_collate_to_runner = {}
map_split_to_runner = {}
def runDE(design_file, counts_file, outfile, outdir, method="deseq", spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input
    data is randomized in order to avoid any biases due to chromosomes
    and break up local correlations.

    At the end, a q-value is computed from all results.

    The statement built here invokes
    ``python -m cgatpipelines.tasks.expression_runner`` on chunks of
    200000 lines and writes the gzipped result to `outfile`.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per windows
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Directory for additional output files.
    method : string
        Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
        Filename with spike-in data to add before processing.
    '''

    # NOTE(review): the %(name)s placeholders below are not filled in
    # by this function. Presumably P.run interpolates them from the
    # local variables of this function and the pipeline configuration -
    # confirm before renaming any local.
    if spike_file is None:
        # counts only: stream the counts table
        statement = "zcat %(counts_file)s"
    else:
        # combine the counts with the spike-in table, then drop the
        # filename column added by --cat=filename
        statement = '''cgat combine_tables --missing-value=0 --cat=filename --log=%(outfile)s.log %(counts_file)s %(spike_file)s | cgat csv_cut --remove filename --log=%(outfile)s.log '''

    # prefix for the per-chunk output files (see
    # --output-filename-pattern in the statement below)
    prefix = iotools.snip(os.path.basename(outfile))
    E.info(prefix)

    # --bashrc=%(pipeline_scriptsdir)s/bashrc.cgat

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += ''' | cgat randomize_lines --keep-header=1 | python -m cgatcore.pipeline.farm --method=multiprocessing --cluster-options="-l mem_free=16G" --cluster-queue=%(cluster_queue)s --cluster-num-jobs=%(cluster_num_jobs)i --cluster-priority=%(cluster_priority)i --cluster-queue-manager=%(cluster_queue_manager)s --cluster-memory-resource=%(cluster_memory_resource)s --cluster-memory-default=%(cluster_memory_default)s --input-header --output-header --split-at-lines=200000 --log=%(outfile)s.log --output-filename-pattern=%(outdir)s/%%s --subdirs --output-regex-header="^test_id" "python -m cgatpipelines.tasks.expression_runner --method=%(method)s --tags-tsv-file=%%STDIN%% --design-tsv-file=%(design_file)s --output-filename-pattern=%%DIR%%%(prefix)s_ --deseq-fit-type=%(deseq_fit_type)s --deseq-dispersion-method=%(deseq_dispersion_method)s --deseq-sharing-mode=%(deseq_sharing_mode)s --edger-dispersion=%(edger_dispersion)f --deseq2-design-formula=%(deseq2_model)s --deseq2-contrasts=%(deseq2_contrasts)s --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i --log=%(outfile)s.log --fdr=%(edger_fdr)f --deseq2-plot=0 " | perl -p -e "s/qvalue/old_qvalue/" | cgat table2table --log=%(outfile)s.log --method=fdr --column=pvalue --fdr-method=BH --fdr-add-column=qvalue | gzip > %(outfile)s '''

    # log the statement template (placeholders not yet filled in) and run it
    E.info(statement)
    P.run(statement)
def run(self, outfile, params):
    """Realign indels and recalibrate base qualities with GATK.

    Five statements are built and handed to ``self.run_statements``:
    RealignerTargetCreator, IndelRealigner, BaseRecalibrator,
    PrintReads and a final ``mv`` of the BAM index.

    Arguments
    ---------
    outfile : string
        Name of the output BAM file.
    params : object
        Tool parameters. ``bam``, ``realignertargetcreator``,
        ``indelrealigner`` and ``baserecalibrator`` are read here.

    Returns
    -------
    The return value of ``self.run_statements``.
    """
    bam = resolve_argument(params.bam)
    reference_fasta = get_reference(params)
    prefix = IOTools.snip(outfile, ".bam")

    fields = dict(
        self=self,
        bam=bam,
        reference_fasta=reference_fasta,
        outfile=outfile,
        params=params,
        prefix=prefix)

    stmnts = []

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type RealignerTargetCreator "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "--log_to_file {outfile}.RealignerTargetCreator.log "
        "{params.realignertargetcreator} "
        "--out {outfile}.realign.intervals "
        ">& {outfile}.RealignerTargetCreator.err".format(**fields))

    # NOTE(review): the token "@[email protected]" in the next three
    # statements looks like an e-mail obfuscation artefact that has
    # replaced the original temporary-file placeholders. The original
    # text cannot be recovered from this file, so it is reproduced
    # unchanged - restore it from version control.
    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type IndelRealigner "
        "--input_file {bam} "
        "--reference_sequence {reference_fasta} "
        "--targetIntervals {outfile}.realign.intervals "
        "--logging_level INFO "
        "--log_to_file {outfile}.IndelRealigner.log "
        "{params.indelrealigner} "
        "--out @[email protected] "
        ">& {outfile}.IndelRealigner.err".format(**fields))

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type BaseRecalibrator "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--logging_level INFO "
        "{params.baserecalibrator} "
        "--log_to_file {outfile}.BaseRecalibrator.log "
        "--out {outfile}.recal_data.table "
        ">& {outfile}.BaseRecalibrator.err".format(**fields))

    stmnts.append(
        "java "
        "-Djava.io.tmpdir=%(tmpdir)s "
        "-jar {self.path} "
        "--analysis_type PrintReads "
        "--input_file @[email protected] "
        "--reference_sequence {reference_fasta} "
        "--BQSR {outfile}.recal_data.table "
        "--logging_level INFO "
        "--log_to_file {outfile}.PrintReads.log "
        "--out {outfile} "
        ">& {outfile}.PrintReads.err".format(**fields))

    # Rename the index from x.bai to x.bam.bai. Previously this
    # statement was never formatted, so the literal "{prefix}" and
    # "{outfile}" were passed on, and its target was
    # "{outfile}.bam.bai" although outfile already ends in ".bam".
    stmnts.append(
        "mv {prefix}.bai {outfile}.bai".format(**fields))

    return self.run_statements(stmnts, job_memory="3G")
def merge_and_load(infiles, outfile, suffix=None, columns=(0, 1), regex=None, row_wise=True, retry=True, options="", prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e, the contents of
    each file are a row.

    For example, the statement::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load")

    with the two files::

        > cat file1.txt
        Category    Result
        length      12
        width       100

        > cat file2.txt
        Category    Result
        length      20
        width       50

    will be added into table ``test_table`` as::

        track   length   width
        file1   12       100
        file2   20       50

    NOTE(review): the original text introduced the following example
    with "If row-wise is set" and passed ``row_wise=True``, which is
    also the default used in the first example. Judging from the
    description of `row_wise` below, ``row_wise=False`` was presumably
    meant - confirm::

        merge_and_load(['file1.txt', 'file2.txt'],
                       "test_table.load", row_wise=False)

    ``test_table`` will be transposed and look like this::

        track    file1 file2
        length   12    20
        width    100   50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information. The
        table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off.  If
        `columns` is set to None, all columns will be taken. Here,
        column names will receive a prefix given by `prefixes`. If
        `prefixes` is None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a
        track name via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table.  This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data can
        not be loaded at the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each
        column. The number of `prefixes` and `infiles` needs to be the
        same.

    Raises
    ------
    ValueError
        If `infiles` is empty.
    '''
    if len(infiles) == 0:
        raise ValueError("no files for merging")

    # one header name per input file: the basename without `suffix`,
    # the groups captured by `regex` joined with "-", or the plain
    # basename
    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix)) for x in infiles])
    elif regex:
        header = ",".join(
            ["-".join(re.search(regex, x).groups()) for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        # cut counts columns from 1, `columns` counts from 0
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        column_filter = ""
        # NOTE(review): the nesting of this prefix handling under
        # "all columns are taken" follows the description of `columns`
        # in the docstring - confirm against version control.
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    # every input file is passed as a process substitution so that it
    # can be decompressed and column-filtered on the fly; compression
    # is decided by the first file only
    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        transform = """| perl -p -e "s/bin/track/" | python -m cgatcore.table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(to_table(outfile), options="--add-index=track " + options, retry=retry)

    # NOTE(review): the %(name)s placeholders are not filled in here
    # and `to_cluster` is not referenced again in this function.
    # Presumably run() reads both from the local variables of this
    # function - confirm before renaming any local.
    statement = """python -m cgatcore.tables %(header_stmt)s --skip-titles --missing-value=0 --ignore-empty %(filenames)s %(transform)s | %(load_statement)s > %(outfile)s """

    to_cluster = False
    run(statement)
def run_report(clean=True, with_pipeline_status=True, pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    Arguments
    ---------
    clean : bool
        If true, ``report``, ``_cache`` and ``_static`` are removed
        before the report is built.
    with_pipeline_status : bool
        If true, the pipeline status image is written to the directory
        given by the ``exportdir`` parameter, which is created if it
        does not exist.
    pipeline_status_format : string
        Format and file extension of the pipeline status image.
    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"])

    # the documentation is looked up next to the file of the caller
    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')
    docdir = os.path.join(dirname, "pipeline_docs", iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    # NOTE(review): relpath and trackerdir are not referenced in the
    # statement below - confirm whether they are still needed.
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    # a conf.py in the current directory takes precedence over the
    # configuration directory next to this file
    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    # (this overrides the value chosen from DISPLAY above)
    erase_return = "|| true"

    # `clean` is turned from a flag into the shell command used in the
    # statement
    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    # NOTE(review): the %(name)s placeholders are not filled in here.
    # Presumably P.run interpolates them from the local variables of
    # this function and the pipeline configuration - confirm before
    # renaming any local.
    statement = ''' %(clean)s (export SPHINX_DOCSDIR=%(docdir)s; export SPHINX_THEMEDIR=%(themedir)s; export PYTHONPATH=%(syspath)s; %(xvfb_command)s %(report_engine)s-build --num-jobs=%(report_threads)s sphinx-build -b html -d %(report_doctrees)s -c %(conf_dir)s -j %(report_threads)s %(docdir)s %(report_html)s >& report.log %(erase_return)s ) '''
    P.run(statement)

    E.info(
        'the report is available at %s' % os.path.abspath(os.path.join(params['report_html'], "contents.html")))
def pre_process(self, infile, outfile, params):
    """Build the statements that pre-process a BAM file.

    Depending on `params`, statements are collected, in this order, for
    copying the BAM file (``copy_bam``), splitting reads
    (``split_bam``), ``daisy bam2bam`` (``bam2bam``), restricting to a
    region (``region``), shifting base qualities (``shift_quality``)
    and removing the ``chr`` prefix from contig names (``remove_chr``).
    The statements are combined with ``P.join_statements`` and followed
    by ``samtools index`` on the resulting file.

    NOTE(review): the token "@[email protected]" in the statements below
    looks like an e-mail obfuscation artefact that has replaced the
    original input/output placeholders, and possibly the file suffix
    following them. The original text cannot be recovered from this
    file, so it is reproduced unchanged - restore it from version
    control.

    Arguments
    ---------
    infile : string
        Name of the input BAM file.
    outfile : string
        Name of the output file; used as prefix for log files.
    params : object
        Tool parameters, see above.

    Returns
    -------
    tuple
        ``(filename, build_statement, cleanup_statement)``. If no
        pre-processing is requested, this is `infile` and two empty
        strings.
    """
    # Unused locals `tmpdir` and `outprefix` were removed; no
    # statement below references them.
    infile = IOTools.snip(infile, ".bam")

    statements = []

    if params.copy_bam:
        statements.append("cp @[email protected] @[email protected]; "
                          "cp @[email protected] @[email protected]")

    if params.split_bam:
        statements.append("daisy bam2bam-split-reads "
                          "-i @[email protected] "
                          "-o - "
                          "{params.split_bam} "
                          "--log={outfile}_split_bam.log "
                          "2> {outfile}_split_bam.err "
                          "> @[email protected]; ".format(
                              params=params, outfile=outfile))

    if params.bam2bam:
        statements.append("daisy bam2bam "
                          "--stdin=@[email protected] "
                          "{params.bam2bam} "
                          "--log={outfile}_bam2bam.log "
                          "2> {outfile}_bam2bam.err "
                          "> @[email protected]; ".format(
                              params=params, outfile=outfile))

    if params.region:
        statements.append(
            "samtools view -b @[email protected] {} > @[email protected]".format(
                params.region))

    if params.shift_quality:
        # add a constant to every base quality character (field 11)
        statements.append("samtools view -h @[email protected] "
                          "| perl -lane "
                          "'if(/^@/) {{print; next;}} "
                          "@qual=split(//, $F[10]); "
                          "$_=chr(ord($_)+{}) for (@qual); "
                          "$F[10]=join(\"\",@qual); "
                          "print join(\"\\t\", @F)' "
                          "| samtools view -bS > @[email protected]".format(
                              params.shift_quality))

    if is_true(params.remove_chr):
        # also substitute chrM to MT.
        statements.append("samtools view -h @[email protected] "
                          "| awk -v OFS='\\t' '"
                          "$1 == \"@SQ\" "
                          "{{ gsub(\"chrM\", \"chrMT\", $2); "
                          " gsub(\"chr\", \"\", $2); print; next }} "
                          "{{ gsub(\"chrM\", \"chrMT\", $3); "
                          " gsub(\"chr\", \"\", $3); print; next}} '"
                          "| samtools view -bS - "
                          "2> {outfile}_remove_chr.log "
                          "> @[email protected]; ".format(outfile=outfile))

    if not statements:
        # nothing to do: hand back the original file
        return infile + ".bam", "", ""

    filename, build_statement, cleanup_statement = P.join_statements(
        statements, infile)
    filename += ".bam"
    build_statement += (
        "; samtools index {filename} >& {outfile}.index.log".format(
            filename=filename, outfile=outfile))

    return filename, build_statement, cleanup_statement
def run(self, infile, outfile, params):
    """Run fastqc on `infile` and split its report into one table per section.

    fastqc is only run if ``<outdir>/<prefix>_fastqc/fastqc_data.txt``
    does not exist yet; otherwise `outfile` is touched. Every section
    of ``fastqc_data.txt`` that has data rows is written to
    ``<outfile>.<tablename>.tsv``, where the table name is
    ``self.name``, an underscore and the lower-cased section name with
    spaces replaced by underscores. The status of these sections is
    written to ``<outfile>.<self.name>_summary.tsv``.

    Returns
    -------
    The return value of ``P.run``, or None if fastqc was not run.

    Raises
    ------
    ValueError
        If a section maps to a table name not in ``self.tablenames``.
    """
    # TODO: bam_fastqc_sequence_length_distribution.tsv may
    # contain ranges such as '30-31'. Convert to beginning of
    # range like in this perl command:
    #
    # perl -p -i -e "s/\-\d+//"
    # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv
    uncompressed_name = infile[:-3] if infile.endswith(".gz") else infile
    prefix = IOTools.snip(os.path.basename(uncompressed_name))

    outdir = os.path.dirname(outfile)
    datafile = os.path.join(
        outdir, "{}_fastqc".format(prefix), "fastqc_data.txt")

    if os.path.exists(datafile):
        IOTools.touch_file(outfile)
        retval = None
    else:
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        retval = P.run(
            "{params.path} "
            "{params.options} "
            "--extract "
            "--outdir {outdir} "
            "{infile} "
            ">& {outfile} ".format(
                params=params,
                outdir=outdir,
                infile=infile,
                outfile=outfile),
            **params._asdict())

    def _split_output(lines):
        # yields (section, header, body, status) at the end of each module
        section = header = status = None
        body = []
        for line in lines:
            if line.startswith("##FastQC"):
                continue
            if line.startswith("#"):
                # column names; a header starts a fresh body
                header = line[1:-1].split("\t")
                body = []
            elif line.startswith(">>END_MODULE"):
                yield section, header, body, status
                section = header = status = None
                body = []
            elif line.startswith(">>"):
                section, status = line[2:-1].split("\t")
            else:
                body.append(line[:-1].split("\t"))

    # split into separate files for upload
    summary_data = []
    with IOTools.open_file(datafile) as inf:
        for section, header, body, status in _split_output(inf):
            if not body:
                continue

            summary_data.append((section, status))
            tablename = "{}_".format(self.name) + \
                section.replace(" ", "_").lower()
            if tablename not in self.tablenames:
                raise ValueError(
                    "unknown tablename {}, expected one of {}".format(
                        tablename, self.tablenames))

            output_file = ".".join((outfile, tablename, "tsv"))
            header_line = "\t".join([x.lower() for x in header]) + "\n"
            # NOTE(review): an earlier comment here said the first
            # column is removed; all columns are written.
            body_lines = "\n".join(["\t".join(x) for x in body]) + "\n"
            with open(output_file, "w") as outf:
                outf.write(header_line)
                outf.write(body_lines)

    output_file = ".".join(
        (outfile, "{}_summary".format(self.name), "tsv"))
    with IOTools.open_file(output_file, "w") as outf:
        outf.write("section\tstatus\n")
        for section, status in summary_data:
            outf.write("{}\t{}\n".format(section, status))

    return retval
def __call__(self, infiles, outfile, only_info=False):
    """Run the task on `infiles`, writing meta data and benchmark for `outfile`.

    The steps are, in order: create the output directory, save the meta
    information, skip the task if one of ``self.ignore`` occurs in the
    output directory (an empty `outfile` is created), stop if only the
    meta information was requested, skip if ``self.ignore_task`` says
    so, replace input files by their mount location, and finally call
    ``self.run``.

    If ``self.run`` fails and the parameter ``on_error`` is ``ignore``,
    the error is logged and an empty `outfile` is created; with
    ``raise`` (the default) the exception propagates.

    Raises
    ------
    ValueError
        If ``on_error`` is neither ``raise`` nor ``ignore``, or if a
        mounted file is encountered while no mount point is configured.
    """
    # NOTE: extras not implemented in ruffus 2.6.3, thus
    # use parameter:
    only_info = "only_info" in P.PARAMS

    # ensure output directory exists.
    # This should be done on the pipeline level, but
    # ruffus currently seems not to allow this.
    outdir = os.path.dirname(outfile)
    if outdir and not os.path.exists(outdir):
        os.makedirs(outdir)

    output_files = [
        self.map_table_to_file(x, outfile) for x in self.tablenames
    ]

    kwargs = dict(
        output_files=output_files,
        input_files=infiles,
        outdir=outdir)

    if self._runtime_regex:
        kwargs["alias"] = self.build_alias(str(infiles),
                                           regex=self._runtime_regex,
                                           alias=self._runtime_alias)

    self.save_meta(outfile, **kwargs)

    if self.ignore and any(pattern in outdir for pattern in self.ignore):
        E.warn("skipping task {} at runtime, an empty file is created".
               format(outfile))
        IOTools.touch_file(outfile)
        return

    # if self.runtime_filter:
    # TODO: create empty outfile if regex matches
    #    pass

    if only_info:
        E.warn(
            "only_info - meta information in {} has been updated".format(
                IOTools.snip(outfile) + ".info"))
        return

    # AH: duplicated from above?
    params = self.build_params(output_files=output_files)

    on_error_options = ["raise", "ignore"]
    on_error = params.get("on_error", "raise")
    if on_error not in on_error_options:
        raise ValueError("unknown option to 'on_error': '{}' "
                         "should be one of '{}'".format(
                             on_error, ",".join(on_error_options)))

    if self.ignore_task(infiles, outfile, params):
        return

    # deal with placeholder files created by identity that are
    # located on a remote mount point
    def map_to_mount(fn):
        marker = fn + ".mnt"
        if not os.path.exists(marker):
            return fn

        if not P.PARAMS["mount_point"]:
            raise ValueError(
                "encountered mounted file {}, but no mount point present"
                .format(fn))

        with open(marker) as inf:
            mount_path = inf.read()
        return os.path.join(P.PARAMS["mount_point"], mount_path)

    # replace infiles with mount locations if necessary
    if isinstance(infiles, list):
        infiles = [map_to_mount(x) for x in infiles]
    else:
        infiles = map_to_mount(infiles)

    try:
        benchmark = self.run(infiles, outfile, as_namedtuple(params))
    except Exception as ex:
        on_error = params.get("on_error", "raise")
        if on_error == "raise":
            raise
        elif on_error == "ignore":
            E.warn(
                "error occured during execution of {} but will be ignored:\n{}"
                .format(self.__name__, ex))
            E.warn(
                "an empty output file {} will be created.".format(outfile))
            IOTools.touch_file(outfile)
            benchmark = None

    if benchmark:
        self.save_benchmark(outfile, benchmark)