def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--regex-filename", dest="regex_filename", type="string",
        help="extract column name from filename via regular expression "
        "[%default]")

    parser.add_option(
        "--filter", dest="filters", type="choice", action="append",
        choices=("PASS", "SNP"),
        help="apply filters to VCFs when reading "
        "[%default]")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in args:
        if options.regex_filename:
            try:
                name = re.search(options.regex_filename, filename).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        options.regex_filename, filename))
        else:
            name = IOTools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=options.filters)
        df[name] = 1
        dfs.append(df)

    ndata = len(dfs)

    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")
    merged_df = merged_df.fillna(0)

    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]

    set_counts.to_csv(options.stdout, sep="\t", index=False)

    E.stop()
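# read_vcf_positions_into_dataframe() is defined elsewhere in this script
# and not shown here. The version below is only a minimal sketch of what it
# might look like, assuming pysam and pandas are available; the
# interpretation of the "PASS" and "SNP" filters is illustrative, not the
# original logic.
import pandas
import pysam


def read_vcf_positions_into_dataframe_sketch(filename, filters=None):
    """return a dataframe with one chrom/pos row per VCF record."""
    filters = filters or []
    positions = []
    with pysam.VariantFile(filename) as vcf:
        for record in vcf:
            # keep only records flagged PASS if requested
            if "PASS" in filters and "PASS" not in record.filter.keys():
                continue
            # keep only simple SNPs if requested (illustrative definition)
            if "SNP" in filters and record.alts and any(
                    len(record.ref) != 1 or len(alt) != 1
                    for alt in record.alts):
                continue
            positions.append((record.chrom, record.pos))
    return pandas.DataFrame(positions, columns=["chrom", "pos"])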
def fastqscreen_filename2track(fn):
    """extract track name from a fastq_screen output filename.

    Because we deal with both paired-end data (track.fastq.1_screen.txt)
    and single-end data (track_screen.txt), this is a bit cumbersome.
    """
    return re.sub(".fastq.", "-",
                  IOTools.snip(os.path.basename(fn), "_screen.txt"))
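# Worked examples for the transformation above (filenames are illustrative):
#   "qc.dir/sample1.fastq.1_screen.txt"  ->  "sample1-1"   (paired end)
#   "qc.dir/sample1_screen.txt"          ->  "sample1"     (single end)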
def summarizeFastqScreen(infiles, outfiles):
    all_files = []
    for infile in infiles:
        all_files.extend(
            glob.glob(IOTools.snip(infile, "screen") + "*_screen.txt"))

    if len(all_files) == 0:
        E.warn("no fastq_screen results to concatenate")
        for x in outfiles:
            IOTools.touch_file(x)
        return

    df_summary, df_details = PipelineReadqc.read_fastq_screen(
        all_files)

    df_summary.to_csv(outfiles[0], sep="\t", index=True)
    df_details.to_csv(outfiles[1], sep="\t", index=True)
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''collect fastqc status results from multiple runs into a single table.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    outfile : string
        Output filename in :term:`tsv` format.
    datadir : string
        Location of actual Fastqc output to be parsed.
    '''

    outf = IOTools.open_file(outfile, "w")
    names = set()
    results = []
    for infile in infiles:
        base_track = IOTools.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir,
                                base_track + "*_fastqc",
                                "fastqc_data.txt")

        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.open_file(fn)):
                stats[name] = status
            track = fastqc_filename2track(fn)
            results.append((track, fn, stats))
            names.update(list(stats.keys()))

    names = sorted(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))
    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" %
                   (track, os.path.dirname(fn),
                    "\t".join(stats[x] for x in names)))
    outf.close()
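# FastqcSectionIterator is imported from the readqc pipeline module and is
# not shown here. The generator below is a rough sketch, based on the
# fastqc_data.txt layout (">>Section<TAB>status" ... ">>END_MODULE"), of how
# such an iterator could yield the (name, status, header, data) tuples
# consumed above; it is illustrative only.
def _fastqc_section_iterator_sketch(infile):
    name, status, header, data = None, None, None, []
    for line in infile:
        line = line.rstrip("\n")
        if line.startswith(">>END_MODULE"):
            yield name, status, header, data
            name, status, header, data = None, None, None, []
        elif line.startswith(">>"):
            # e.g. ">>Per base sequence quality<TAB>pass"
            name, status = line[2:].split("\t")[:2]
        elif line.startswith("#"):
            header = line[1:]
        else:
            data.append(line)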
def runDE(design_file,
          counts_file,
          outfile,
          outdir,
          method="deseq",
          spike_file=None):
    '''run DESeq, DESeq2 or EdgeR through :mod:`scripts/runExpression.py`

    The job is split into smaller sections. The order of the input data
    is randomized in order to avoid any biases due to chromosomes and to
    break up local correlations. At the end, a q-value is computed from
    all results.

    Arguments
    ---------
    design_file : string
        Filename with experimental design
    counts_file : string
        :term:`tsv` formatted file with counts per window
    outfile : string
        Output filename in :term:`tsv` format.
    outdir : string
        Directory for additional output files.
    method : string
        Method to use. See :mod:`scripts/runExpression.py`.
    spike_file : string
        Filename with spike-in data to add before processing.
    '''

    if spike_file is None:
        statement = "zcat %(counts_file)s"
    else:
        statement = '''cgat combine_tables
        --missing-value=0
        --cat=filename
        --log=%(outfile)s.log
        %(counts_file)s %(spike_file)s
        | cgat csv_cut
        --remove filename
        --log=%(outfile)s.log
        '''

    prefix = IOTools.snip(os.path.basename(outfile))
    E.info(prefix)

    # --bashrc=%(pipeline_scriptsdir)s/bashrc.cgat

    # the post-processing strips away the warning,
    # renames the qvalue column to old_qvalue
    # and adds a new qvalue column after recomputing
    # over all windows.
    statement += '''
    | cgat randomize_lines --keep-header=1
    | python -m CGATCore.Pipeline.farm
    --method=multiprocessing
    --cluster-options="-l mem_free=16G"
    --cluster-queue=%(cluster_queue)s
    --cluster-num-jobs=%(cluster_num_jobs)i
    --cluster-priority=%(cluster_priority)i
    --cluster-queue-manager=%(cluster_queue_manager)s
    --cluster-memory-resource=%(cluster_memory_resource)s
    --cluster-memory-default=%(cluster_memory_default)s
    --input-header
    --output-header
    --split-at-lines=200000
    --log=%(outfile)s.log
    --output-filename-pattern=%(outdir)s/%%s
    --subdirs
    --output-regex-header="^test_id"
    "cgat runExpression
    --method=%(method)s
    --tags-tsv-file=%%STDIN%%
    --design-tsv-file=%(design_file)s
    --output-filename-pattern=%%DIR%%%(prefix)s_
    --deseq-fit-type=%(deseq_fit_type)s
    --deseq-dispersion-method=%(deseq_dispersion_method)s
    --deseq-sharing-mode=%(deseq_sharing_mode)s
    --edger-dispersion=%(edger_dispersion)f
    --deseq2-design-formula=%(deseq2_model)s
    --deseq2-contrasts=%(deseq2_contrasts)s
    --filter-min-counts-per-row=%(tags_filter_min_counts_per_row)i
    --filter-min-counts-per-sample=%(tags_filter_min_counts_per_sample)i
    --filter-percentile-rowsums=%(tags_filter_percentile_rowsums)i
    --log=%(outfile)s.log
    --fdr=%(edger_fdr)f
    --deseq2-plot=0"
    | perl -p -e "s/qvalue/old_qvalue/"
    | cgat table2table
    --log=%(outfile)s.log
    --method=fdr
    --column=pvalue
    --fdr-method=BH
    --fdr-add-column=qvalue
    | gzip
    > %(outfile)s '''

    E.info(statement)

    P.run(statement)
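# Hypothetical call from a pipeline task (paths are illustrative; the
# configuration keys interpolated inside the statement, e.g.
# deseq_fit_type or cluster_queue, must be present in the pipeline
# parameters):
#
#   runDE(design_file="design.tsv",
#         counts_file="counts.tsv.gz",
#         outfile="deseq.dir/output.tsv.gz",
#         outdir="deseq.dir",
#         method="deseq2")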
def peek_parameters(workingdir,
                    pipeline,
                    on_error_raise=None,
                    prefix=None,
                    update_interface=False,
                    restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the
       parameters are added to the configuration dictionary of the
       calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.
    '''
    caller_locals = get_caller_locals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified,
    # i.e. working dir is set to the placeholder "?!"
    if ("config" in sys.argv or
            "check" in sys.argv or
            "clone" in sys.argv) and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline

    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y)
                     for x, y in list(dump.items())])

    return dump
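# Hypothetical usage from a downstream pipeline, pulling in only the
# interface files of a pipeline that was run in a sibling directory
# (directory and pipeline names are illustrative):
#
#   PARAMS.update(peek_parameters(
#       "../annotations",
#       "pipeline_annotations.py",
#       prefix="annotations_",
#       update_interface=True,
#       restrict_interface=True))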
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--first-gtf-file", dest="gtf_a", type="string",
        help="supply a gtf file - will compress uncompressed files")

    parser.add_option(
        "-b", "--second-gtf-file", dest="gtf_b", type="string",
        help="supply a second gtf file - will compress uncompressed files")

    parser.add_option("-s", "--scripts-dir", dest="scripts_dir",
                      type="string",
                      help="supply a location for accessory scripts")

    parser.add_option("--no-venn", dest="no_venn", action="store_true",
                      help="set if no venn is to be drawn")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    gtf_files = [options.gtf_a, options.gtf_b]

    merged_files = []
    prefices = []
    E.info("merging gtf files")
    for gtf in gtf_files:
        if gtf.endswith(".gtf.gz"):
            outfile = IOTools.snip(gtf, ".gtf.gz") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf.gz"))
            merged_files.append(outfile)
            statement = '''zcat %s
            | python %s/gtf2gtf.py --method=merge-transcripts
            --log=%s.log
            | gzip > %s''' % (gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        elif gtf.endswith(".gtf"):
            outfile = IOTools.snip(gtf, ".gtf") + ".merged.gtf.gz"
            prefices.append(IOTools.snip(gtf, ".gtf"))
            merged_files.append(outfile)
            statement = '''cat %s
            | python %s/gtf2gtf.py --method=merge-transcripts
            --log=%s.log
            | gzip > %s''' % (gtf, options.scripts_dir, outfile, outfile)
            P.execute(statement)
        else:
            raise ValueError(
                "cannot perform merge on %s: is not a gtf file" % gtf)

    for prefix in prefices:
        if options.gtf_a.find(prefix) != -1:
            gtf_a = prefix + ".merged.gtf.gz"
            prefix_a = prefix
        elif options.gtf_b.find(prefix) != -1:
            gtf_b = prefix + ".merged.gtf.gz"
            prefix_b = prefix

    E.info("intersecting gtf files")

    # intersect the resulting merged files
    scriptsdir = options.scripts_dir
    intersection_out = "_vs_".join(
        [prefix_a, prefix_b]) + ".intersection.gtf.gz"
    statement = '''intersectBed -a %(gtf_a)s -b %(gtf_b)s -s -wa
    | python %(scriptsdir)s/gtf2gtf.py --method=merge-transcripts
    --log=log
    | gzip > %(intersection_out)s'''
    P.run(statement)

    if not options.no_venn:
        E.info("producing venn diagram for %s vs %s..."
               % (options.gtf_a, options.gtf_b))

        # produce the venn diagram
        intersection_file = intersection_out
        gtf_a_merged = gtf_a
        gtf_b_merged = gtf_b

        # create dictionary key
        gtf_pair = (gtf_a_merged, gtf_b_merged)

        # containers for counts
        count_gtf_merged_a = 0
        count_gtf_merged_b = 0
        count_intersection = 0

        # create GTF iterator objects
        gtf_iterator_a = GTF.iterator(IOTools.open_file(gtf_pair[0]))
        gtf_iterator_b = GTF.iterator(IOTools.open_file(gtf_pair[1]))
        gtf_iterator_intersection = GTF.iterator(
            IOTools.open_file(intersection_file))

        # do the counts for each file
        E.info("counting entries in %s" % gtf_a)
        for entry in gtf_iterator_a:
            count_gtf_merged_a += 1
        print("counts for gtf-a: ", count_gtf_merged_a)

        E.info("counting entries in %s" % gtf_b)
        for entry in gtf_iterator_b:
            count_gtf_merged_b += 1
        print("counts for gtf-b: ", count_gtf_merged_b)

        E.info("counting entries in %s" % intersection_file)
        for entry in gtf_iterator_intersection:
            count_intersection += 1
        print("counts for intersection: ", count_intersection)

        # this is the important bit - basically take an arbitrary list of
        # numbers to represent the list of lincrna in the refnoncoding set.
        # Then use the intersection count to represent the overlapping
        # section in the lincrna set and add a set of random numbers to
        # this set to make up the remaining, non-overlapping set.
        result = {}
        E.info("assembling count lists")
        result[gtf_pair] = {
            "gtf-b": list(map(str, range(count_gtf_merged_b))),
            "gtf-a": list(map(str, range(count_intersection))) +
            list(map(str, [random.random()
                           for i in range(count_intersection,
                                          count_gtf_merged_a)]))
        }

        R_source = os.path.join(os.path.abspath(options.scripts_dir),
                                "venn_diagram.R")
        R.source(R_source)

        prefix_a = prefix_a.replace(".", "_").replace("-", "_")
        prefix_b = prefix_b.replace(".", "_").replace("-", "_")

        R('''prefix.a <- "%s"''' % prefix_a)
        R('''prefix.b <- "%s"''' % prefix_b)
        E.info("drawing venn diagram to %s" %
               (prefix_a + "_vs_" + prefix_b + ".overlap.png"))

        R["venn.diagram2"](R.list(A=result[gtf_pair]["gtf-a"],
                                  B=result[gtf_pair]["gtf-b"]),
                           prefix_a + "_vs_" + prefix_b + ".overlap.png",
                           **{'cat.cex': 1.5,
                              'main.fontfamily': "Arial",
                              'cat.pos': FloatVector((0, 0)),
                              'cat.fontfamily': "Arial",
                              'main.cex': 1.8,
                              'height': 1000,
                              'width': 1000,
                              'cex': 2,
                              'fontfamily': "Arial",
                              'lwd': R.c(1, 1),
                              'fill': R.c(R.rgb(0, 0, 0.5, 0.5),
                                          R.rgb(0.5, 0, 0, 0.5)),
                              'category.names': R.c(prefix_a, prefix_b),
                              'margin': R.c(0.1, 0.1, 0.1, 0.1)})

    # write footer and output benchmark information.
    E.stop()
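# Hypothetical invocation of this script (script name is illustrative):
#
#   python gtfs2venn.py -a first.gtf.gz -b second.gtf.gz \
#       --scripts-dir=/path/to/cgat/scripts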
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bam-file", dest="bam_file", type="string",
                      help="supply input bam file name")

    parser.add_option("-g", "--gtf-file", dest="gtf_file", type="string",
                      help="supply input gtf file name")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="supply output file name")

    parser.add_option(
        "-G", "--reference-gtf-file", dest="reference_gtf", type="string",
        help="supply reference gtf for context of reads not contributing "
        "to transcripts")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ######################################################
    # for all alignments
    ######################################################

    # open outfile and prepare headers
    outf = IOTools.open_file(options.outfile, "w")
    outf.write("\t".join(["total alignments",
                          "alignments in transcripts",
                          "percent alignments in transcripts",
                          "total spliced alignments",
                          "spliced alignments in transcripts",
                          "percent spliced alignments in transcripts"]) +
               "\n")

    # calculate coverage over transcript file - NB split reads contribute
    # twice to the transcript

    # use BedTool object
    pybedbamfile = pybedtools.BedTool(options.bam_file)

    # count alignments
    E.info("counting total number of alignments and spliced alignments")
    total_alignments = 0
    spliced_alignments = 0

    for alignment in pybedbamfile:
        cigar = alignment[5]
        if cigar.find("N") != -1:  # N signifies split read
            total_alignments += 1
            spliced_alignments += 1
        else:
            total_alignments += 1

    # merge the gtf file to avoid double counting of exons in different
    # transcripts - converts to a bed file
    gtffile = pybedtools.BedTool(options.gtf_file).merge()

    E.info("computing coverage of alignments in %s over intervals in %s" %
           (options.bam_file, options.gtf_file))
    cover = pybedbamfile.coverage(gtffile)

    # make sure that the exons aren't being counted twice - shouldn't be
    # because of merge
    E.info("counting reads contributing to transcripts")
    c = 0
    for entry in cover:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    # sum the coverage across exons from all transcripts
    coverage_in_transcripts = c

    ######################################################
    # for spliced alignments
    ######################################################

    # count total number of spliced alignments
    # requires that the CIGAR string 'N' is present

    # uses pysam to write out a bam file of the spliced reads only
    allreads = pysam.AlignmentFile(options.bam_file)
    spliced_bamname = IOTools.snip(options.bam_file, ".bam") + \
        "_spliced_reads.bam"

    # open file for outputting spliced alignments
    splicedreads = pysam.AlignmentFile(spliced_bamname, "wb",
                                       template=allreads)

    # cigar string in pysam for spliced alignment is (3, int)
    spliced = collections.defaultdict(list)
    for read in allreads:
        for cigar_tag in read.cigar:
            if cigar_tag[0] == 3:
                spliced[read].append(cigar_tag)

    # write out spliced alignments
    for read in list(spliced.keys()):
        splicedreads.write(read)
    splicedreads.close()
    allreads.close()

    # sort and index the spliced reads bam file
    pysam.sort(spliced_bamname, IOTools.snip(spliced_bamname, ".bam"))
    pysam.index(spliced_bamname)

    # read in the spliced reads as a BedTool object
    splicedbam = pybedtools.BedTool(spliced_bamname)

    # perform coverage of spliced reads over intervals - will be twice
    # as many as there should be due to counting both exons
    # overlapping
    spliced_coverage = splicedbam.coverage(gtffile)

    # avoid double counting exons
    E.info("counting spliced reads contributing to transcripts")
    spliced_exons = {}
    c = 0
    for entry in spliced_coverage:
        coverage = int(entry[3])
        if coverage > 0:
            c += coverage

    spliced_coverage_in_transcripts = c

    # NOTE: the counting of spliced alignments is not accurate
    spliced_coverage_in_transcripts = \
        float(spliced_coverage_in_transcripts) / 2

    ###########################
    # write out the results
    ###########################

    outf.write(str(int(total_alignments)) + "\t")

    # remove half of the coverage assigned to spliced reads
    coverage_in_transcripts = \
        (coverage_in_transcripts) - (spliced_coverage_in_transcripts)
    outf.write(str(int(coverage_in_transcripts) -
                   int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(str(int((coverage_in_transcripts /
                        total_alignments) * 100)) + "\t")

    # write out spliced counts
    outf.write(str(int(spliced_alignments)) + "\t")
    outf.write(str(int(spliced_coverage_in_transcripts)) + "\t")
    outf.write(str(int((spliced_coverage_in_transcripts /
                        spliced_alignments) * 100)))

    outf.close()

    ############################
    # contextualise those that
    # don't fall in transcripts
    ############################

    if options.reference_gtf:
        context_summary = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded.context",
            "w")
        context_summary.write("\t".join(["Feature", "number"]) + "\n")

        # write out the read info as well
        context_file = IOTools.open_file(
            IOTools.snip(options.bam_file, ".bam") + ".excluded", "w")

        context_dict = collections.defaultdict(int)

        # intersect bam - write non-overlapping with transcripts - intersect
        # with reference - write out
        context = pybedbamfile.intersect(
            gtffile, v=True, bed=True).intersect(
                pybedtools.BedTool(options.reference_gtf), wb=True)
        for entry in context:
            feature = entry[8]
            context_dict[feature] += 1
            context_file.write("\t".join([e for e in entry]) + "\n")

        for feature, value in context_dict.items():
            context_summary.write("\t".join([feature, str(value)]) + "\n")

        context_file.close()
        context_summary.close()

    # write footer and output benchmark information.
    E.stop()
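# Hypothetical invocation of this script (script name is illustrative):
#
#   python bam2transcriptContribution.py \
#       --bam-file=sample.bam \
#       --gtf-file=transcripts.gtf.gz \
#       --reference-gtf-file=reference.gtf.gz \
#       --outfile=sample.transcript_contribution.tsv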
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run CGATreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''
    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=params["ruffus_checksums_level"]
        )

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          IOTools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = IOTools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(params['report_html'], "contents.html")))
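# Hypothetical pipeline target wiring run_report() into a ruffus pipeline;
# the task name and decorator usage follow the usual CGAT layout but are
# illustrative here:
#
#   @ruffus.follows("full")
#   def build_report():
#       """build the CGATreport report."""
#       run_report(clean=True)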