def parse_commandline(argv=None, **kwargs):
    """parse command line.

    Create option parser and parse command line.

    Arguments
    ---------
    argv : list
        List of command line options to parse. If None, use sys.argv.
    **kwargs : dict
        Additional arguments overwrite default option settings.

    Returns
    -------
    options : object
        Command line options container
    args : list
        List of command line arguments
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=("make", "show", "plot", "dump", "config",
                               "clone", "check", "regenerate", "state",
                               "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-c", "--config-file", dest="config_file",
                      help="benchmark configuration file "
                      "[default=%default].")

    parser.add_option("-f", "--force-run", dest="force_run",
                      type="string",
                      help="force running the pipeline even if there are "
                      "up-to-date tasks. If option is 'all', all tasks "
                      "will be rerun. Otherwise, only the tasks given as "
                      "arguments will be rerun "
                      "[default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess",
                      type="int",
                      help="number of parallel processes to use on "
                      "submit host (different from number of jobs to use "
                      "for cluster jobs) [default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile [default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set parameter values "
                      "[default=%default].")

    parser.add_option("--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums "
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run [default=%default].")

    parser.add_option("--engine", dest="engine",
                      choices=("local", "arvados"),
                      help="engine to use [default=%default].")

    parser.add_option("--always-mount", dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--only-info", dest="only_info",
                      action="store_true",
                      help="only update meta information, do not run "
                      "[default=%default].")

    parser.add_option("--work-dir", dest="work_dir", type="string",
                      help="working directory. Will be created if it "
                      "does not exist [default=%default].")

    group = E.OptionGroup(parser, "Pipeline logging configuration")

    group.add_option("--pipeline-logfile", dest="pipeline_logfile",
                     type="string",
                     help="primary logging destination "
                     "[default=%default].")

    group.add_option("--shell-logfile", dest="shell_logfile",
                     type="string",
                     help="filename for shell debugging information. "
                     "If it is not an absolute path, the output will be "
                     "written into the current working directory. "
                     "If unset, no logging will be output "
                     "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(pipeline_action=None,
                        pipeline_format="svg",
                        pipeline_targets=[],
                        force_run=False,
                        multiprocess=None,
                        pipeline_logfile="pipeline.log",
                        shell_logfile=None,
                        dry_run=False,
                        log_exceptions=True,
                        engine="local",
                        exceptions_terminate_immediately=None,
                        debug=False,
                        variables_to_set=[],
                        is_test=False,
                        ruffus_checksums_level=0,
                        config_file="benchmark.yml",
                        work_dir=None,
                        always_mount=False,
                        only_info=False,
                        input_validation=False)

    parser.set_defaults(**kwargs)

    if "callback" in kwargs:
        kwargs["callback"](parser)

    logger_callback = setup_logging

    (options, args) = E.start(parser,
                              add_cluster_options=True,
                              argv=argv,
                              logger_callback=logger_callback)

    return options, args
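# Usage sketch for parse_commandline (hypothetical argv values; ``USAGE``
# and ``setup_logging`` are defined elsewhere in this module, so this is
# an assumption about how the function is driven, not verbatim project
# code):
#
#     options, args = parse_commandline(
#         ["benchmark", "--pipeline-action", "show", "--dry-run"])
#     if options.dry_run:
#         print("would run action:", options.pipeline_action)
#
# Keyword arguments override the defaults set above, e.g.
# parse_commandline(argv, config_file="other.yml").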
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--input-filename-fasta", dest="input_filename_fasta",
        type="string",
        help="filename with reference sequence in fasta format [%default]")

    parser.add_option(
        "--input-filename-bam", dest="input_filename_bam", type="string",
        help="filename with aligned reads [%default]")

    parser.add_option(
        "--method", dest="methods", type="choice", action="append",
        choices=["add-strelka-genotype", "lift-over"],
        help="methods to apply [%default]")

    parser.add_option(
        "--input-filename-chain", dest="input_filename_chain",
        type="string",
        help="filename with alignment chain for lift-over [%default]")

    parser.add_option(
        "--normal-sample-regex", dest="normal_sample_regex", type="string",
        help="regular expression to apply to header to identify normal "
        "sample id [%default]")

    parser.add_option(
        "--output-filename-unmapped", dest="output_filename_unmapped",
        type="string",
        help="filename with variants that could not be lifted over "
        "[%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    if "lift-over" in options.methods:
        if options.input_filename_chain is None:
            raise ValueError(
                "--method=lift-over requires --input-filename-chain")
        if not os.path.exists(options.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                options.input_filename_chain))
        E.info("reading chain from {}".format(options.input_filename_chain))
        with IOTools.open_file(options.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    outf = options.stdout

    c = E.Counter()

    if "add-strelka-genotype" in options.methods:

        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()
        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by CGATCore vcf2vcf.">')
        header = "\n".join(header)

        if options.normal_sample_regex:
            normal_sample = re.search(
                r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {record.alts[0]: "1",
                                  record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(
                        r"\tFORMAT\t",
                        "\tFORMAT\t%s\t" % normal_sample,
                        header)
                    outf.write(header + "\n")
                is_first = False

            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in options.methods:

        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length)
            for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                options.input_filename_chain,
                options.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig sizes against the chain file
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} the "
                            "chain file says {}, but the fasta file says "
                            "{}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if options.output_filename_unmapped:
            outfile_unmapped = IOTools.open_file(
                options.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_multimapping_position\t{}".format(
                            str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(
                        y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write(
                            "{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # the lifted reference base equals the old ALT allele, so
                # exchange REF (column 4) and ALT (column 5)
                fields[3] = alt_base
                fields[4] = record.ref

            # update genotype fields
            keep = False
            for idx in range(9, len(fields)):
                gt, rest = fields[idx].split(":", 1)
                keep = keep or "0" in gt
                fields[idx] = ":".join((gt.translate(trans_genotypes),
                                        rest))

            # remove reference-only calls
            if not keep:
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "reference_call\t{}".format(str(record)))
                c.skipped_allele_swap_reference += 1
                continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(
                ",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(
                ",".join(sorted(unmapped_contigs))))

    E.stop()
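# Usage sketch for the lift-over method (hypothetical file names; the
# script name is an assumption, as only the function body is shown here):
#
#     python vcf2vcf.py --method=lift-over \
#         --input-filename-chain=hg19ToHg38.over.chain.gz \
#         --input-filename-fasta=hg38.fa \
#         --output-filename-unmapped=unmapped.tsv \
#         in.vcf > lifted.vcf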
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--filter-query", dest="filename_filter_query",
                      type="string",
                      help="filename with intervals in the query "
                      "to filter (in gff format) [default=%default].")

    parser.add_option("--filter-target", dest="filename_filter_target",
                      type="string",
                      help="filename with intervals in the target to "
                      "filter (in gff format) [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("map", "merge", "add-sequence",
                               "complement", "select-query", "test",
                               "filter-keep", "filter-remove",
                               "rename-query", "sanitize", "filter-fasta",
                               "remove-overlapping-query",
                               "remove-overlapping-target"),
                      help="action to perform [default=%default].")

    parser.add_option("--select", dest="select", type="choice",
                      choices=("most-nmatches", "least-nmatches",
                               "most-nmismatches", "least-nmismatches"),
                      help="entry to select [default=%default].")

    parser.add_option("--header-names", dest="header", type="choice",
                      choices=("none", "table", "full"),
                      help="output psl header [default=%default].")

    parser.add_option("--format", dest="format", type="choice",
                      choices=("gff", "gtf"),
                      help="format of intervals [default=%default].")

    parser.add_option("--queries-tsv-file", dest="filename_queries",
                      type="string",
                      help="fasta filename with queries.")

    parser.add_option("--target-psl-file", dest="filename_sbjcts",
                      type="string",
                      help="fasta filename with sbjct [default=%default].")

    parser.add_option("--id-format", dest="id_format", type="string",
                      help="format of new identifiers for the rename "
                      "function [default=%default].")

    parser.add_option("--unique", dest="unique", action="store_true",
                      help="in the rename function, make each match "
                      "unique [default=%default].")

    parser.add_option("--output-filename-map", dest="output_filename_map",
                      type="string",
                      help="filename with map of old to new labels for "
                      "rename function [default=%default].")

    parser.add_option("--complement-min-length",
                      dest="complement_min_length", type="int",
                      help="minimum length for complemented blocks "
                      "[default=%default].")

    parser.add_option("--complement-border", dest="complement_border",
                      type="int",
                      help="number of residues to exclude before alignment "
                      "at either end [default=%default].")

    parser.add_option("--complement-aligner", dest="complement_aligner",
                      type="choice",
                      choices=("clustal", "dba", "dialign", "dialign-lgs"),
                      help="aligner for complemented segments "
                      "[default=%default].")

    parser.add_option("--threshold-merge-distance",
                      dest="threshold_merge_distance", type="int",
                      help="distance in nucleotides at which two adjacent "
                      "reads shall be merged even if they are not "
                      "overlapping [%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="for debugging purposes - stop after x "
                      "iterations [default=%default].")

    parser.set_defaults(filename_filter_target=None,
                        filename_filter_query=None,
                        filename_queries=None,
                        filename_sbjcts=None,
                        threshold_merge_distance=0,
                        report_step=100000,
                        min_aligned=100,
                        methods=[],
                        format="gff",
                        select="most-nmatches",
                        id_format="%06i",
                        unique=False,
                        output_filename_map=None,
                        header=None,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.filename_queries:
        query_fasta = IndexedFasta.IndexedFasta(options.filename_queries)
    else:
        query_fasta = None

    if options.filename_sbjcts:
        sbjct_fasta = IndexedFasta.IndexedFasta(options.filename_sbjcts)
    else:
        sbjct_fasta = None

    if "add-sequence" in options.methods and \
            (sbjct_fasta is None or query_fasta is None):
        raise ValueError(
            "please supply both indexed query and "
            "target/genome sequence data.")

    iterator = Blat.iterator(options.stdin)

    if options.header is not None and options.header != "none":
        if options.header == "table":
            options.stdout.write("\t".join(Blat.FIELDS) + "\n")
        elif options.header == "full":
            options.stdout.write(Blat.HEADER + "\n")

    for method in options.methods:
        if method == "map":
            pslMap(options)
            break
        elif method == "filter-keep":
            pslFilter(options, keep=True)
            break
        elif method == "filter-remove":
            pslFilter(options, keep=False)
            break
        elif method == "merge":
            pslMerge(options)
            break
        elif method == "add-sequence":
            pslAddSequence(query_fasta, sbjct_fasta, options)
            break
        elif method == "complement":
            pslComplement(query_fasta, sbjct_fasta, options)
            break
        elif method == "select-query":
            pslSelectQuery(options)
            break
        elif method == "test":
            iterator = Blat.iterator_test(iterator, options.report_step)
        elif method == "rename-query":
            iterator = iterator_rename_query(iterator, options)
        elif method == "sanitize":
            iterator = iterator_sanitize(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "filter-fasta":
            iterator = iterator_filter_fasta(
                iterator, query_fasta, sbjct_fasta, options)
        elif method == "remove-overlapping-query":
            iterator = iterator_filter_overlapping_query(iterator, options)
        elif method == "remove-overlapping-target":
            iterator = iterator_filter_overlapping_target(iterator, options)

    for psl in iterator:
        options.stdout.write("%s\n" % str(psl))

    E.stop()
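# Usage sketch (hypothetical file names; the script name is an
# assumption). PSL records are read from stdin and written to stdout:
#
#     cat in.psl | python psl2psl.py --method=rename-query \
#         --output-filename-map=rename.tsv > renamed.psl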
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2table.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-a", "--annotations-tsv-file",
                      dest="filename_annotations", type="string",
                      help="filename with base annotations (output from "
                      "gtf2fasta.py) [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with exon information (gff formatted "
                      "file) [default=%default].")

    parser.add_option("-j", "--junctions-bed-file",
                      dest="filename_junctions", type="string",
                      help="filename with junction information (filename "
                      "with exon junctions) [default=%default].")

    parser.add_option("-c", "--vcf-file", dest="filename_vcf",
                      type="string",
                      help="vcf file to parse [default=%default].")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice", choices=("pileup", "vcf"),
                      help="input format [default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id in vcf file to analyse "
                      "[default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_annotations=None,
        filename_exons=None,
        filename_junctions=None,
        input_format="pileup",
        vcf_sample=None,
        filename_vcf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_junctions:
        junctions = readJunctions(options.filename_junctions)
    else:
        junctions = None

    # setup iterator
    if options.input_format == "pileup":
        iterator = pysam.Pileup.iterate(sys.stdin)
    elif options.input_format == "vcf":
        if not options.vcf_sample:
            raise ValueError(
                "vcf format requires sample id (--vcf-sample) to be set")
        if not options.filename_vcf:
            raise ValueError(
                "reading from vcf requires vcf filename (--vcf-file) "
                "to be set")
        iterator = pysam.Pileup.iterate_from_vcf(options.filename_vcf,
                                                 options.vcf_sample)

    modules = []
    modules.append(BaseAnnotatorSNP())

    if options.filename_exons:
        modules.append(BaseAnnotatorExons(options.filename_exons,
                                          fasta=fasta))
    if options.filename_annotations:
        modules.append(BaseAnnotatorCodon(options.filename_annotations,
                                          fasta=fasta,
                                          junctions=junctions))
    if options.filename_junctions:
        modules.append(BaseAnnotatorSpliceSites(options.filename_junctions,
                                                fasta=fasta))

    options.stdout.write("\t".join([x.getHeader() for x in modules]) + "\n")

    for snp in iterator:
        ninput += 1

        # translate chromosome according to fasta
        if fasta:
            try:
                snp = snp._replace(
                    chromosome=fasta.getToken(snp.chromosome))
            except KeyError:
                E.warn("unknown contig `%s` for snp `%s`" %
                       (snp.chromosome, str(snp)))
                nskipped += 1
                continue

        for module in modules:
            module.update(snp)

        options.stdout.write("\t".join(map(str, modules)) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.stop()
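# Usage sketch (hypothetical file names; the script name comes from the
# version string above):
#
#     python snp2table.py --input-format=vcf --vcf-file=calls.vcf \
#         --vcf-sample=SAMPLE01 --genome-file=genome > annotated_snps.tsv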
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--annotations-tsv-file",
                      dest="filename_annotations", type="string",
                      help="filename mapping gene ids to annotations (a "
                      "tab-separated table with two columns) "
                      "[default=%default].")

    parser.add_option("-r", "--resolution", dest="resolution", type="int",
                      help="resolution of count vector "
                      "[default=%default].")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins in count vector "
                      "[default=%default].")

    parser.add_option("-i", "--num-samples", dest="num_samples",
                      type="int",
                      help="sample size to compute [default=%default].")

    parser.add_option("-w", "--workspace-bed-file",
                      dest="filename_workspace", type="string",
                      help="filename with workspace information "
                      "[default=%default].")

    parser.add_option("--workspace-builder", dest="workspace_builder",
                      type="choice",
                      choices=("gff", "gtf-intergenic", "gtf-intronic",
                               "gtf-genic"),
                      help="given a gff/gtf file build a workspace "
                      "[default=%default].")

    parser.add_option("--workspace-labels", dest="workspace_labels",
                      type="choice",
                      choices=("none", "direction", "annotation"),
                      help="labels to use for the workspace "
                      "[default=%default].")

    parser.add_option("--sampler", dest="sampler", type="choice",
                      choices=("permutation", "gaps"),
                      help="sampler to use. The sampler determines the "
                      "null model of how segments are distributed in the "
                      "workspace [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice",
                      action="append",
                      choices=("transcription", "closest-distance",
                               "all-distances"),
                      help="counter to use. The counter computes the "
                      "quantity of interest [default=%default]")

    parser.add_option("--analysis", dest="analysis", type="choice",
                      action="append",
                      choices=("proximity", "area-under-curve"),
                      help="analysis to perform [default=%default]")

    parser.add_option("--transform-counts", dest="transform_counts",
                      type="choice", choices=("raw", "cumulative"),
                      help="cumulate counts [default=%default].")

    parser.add_option("-s", "--segments", dest="filename_segments",
                      type="string",
                      help="filename with segment information "
                      "[default=%default].")

    parser.add_option("--xrange", dest="xrange", type="string",
                      help="xrange to plot [default=%default]")

    parser.add_option("-o", "--logscale", dest="logscale", type="string",
                      help="use logscale on x, y or xy [default=%default]")

    parser.add_option("-p", "--plot", dest="plot", action="store_true",
                      help="output plots [default=%default]")

    parser.add_option("--hardcopy", dest="hardcopy", type="string",
                      help="output hardcopies to file [default=%default]")

    parser.add_option("--no-fdr", dest="do_fdr", action="store_false",
                      help="do not compute FDR rates [default=%default]")

    parser.add_option("--segments-format", dest="segments_format",
                      type="choice", choices=("gtf", "bed"),
                      help="format of segments file [default=%default].")

    parser.add_option("--truncate", dest="truncate", action="store_true",
                      help="truncate segments extending beyond a "
                      "workspace [default=%default]")

    parser.add_option("--remove-overhangs", dest="remove_overhangs",
                      action="store_true",
                      help="remove segments extending beyond a workspace "
                      "[default=%default]")

    parser.add_option("--keep-ambiguous", dest="keep_ambiguous",
                      action="store_true",
                      help="keep segments extending to more than one "
                      "workspace [default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = list(map(float, options.xrange.split(",")))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and \
            not options.filename_annotations:
        raise ValueError(
            "please specify --annotations-tsv-file if "
            "--workspace-labels=annotation.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).__next__

        def dicttype():
            return collections.defaultdict(
                constant_factory(("unknown", )))

        map_id2annotations = IOTools.readMultiMap(
            open(options.filename_annotations, "r"),
            dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.items():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))

    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.items():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int64

        E.debug("chosen dtype %s" % str(dtype))

        E.info("sample space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, "
               "%i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize *
                (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(labels,
                                    options.num_bins,
                                    options.resolution,
                                    dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(counter(labels,
                                            options.num_bins,
                                            options.resolution,
                                            dtype=dtype))

        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

    E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.items():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segments_per_workspace.append(len(observed))
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(observed,
                                                  work_start, work_end,
                                                  left_labels,
                                                  right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end,
                        left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, "
           "nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts
    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis
    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected",
             "CIlower", "CIupper", "qvalue", "segments",
             "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    # qvalue = expected false positives /
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(scipy.stats.percentileofscore(
                        medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
    obs_pvalues.sort()

    # compute observed FDR
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, "
                            "counts=%i" %
                            (sample, label,
                             counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), "
               "simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)

                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end,
                                   total_a1, total_a3)

                blocks = list(block_iterator(obs, mmax, mmean,
                                             options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label,
                             start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution,
                             total_obs,
                             total_mean,
                             delta,
                             total_obs / total_mean,
                             100.0 * (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write(
                    "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                    (label,
                     start * options.resolution,
                     end * options.resolution,
                     (end - start) * options.resolution,
                     total_obs,
                     total_mean,
                     delta,
                     total_obs / total_mean,
                     100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:
                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                    (label,
                     st.observed * options.resolution,
                     st.pvalue,
                     st.expected * options.resolution,
                     st.ci95lower * options.resolution,
                     st.ci95upper * options.resolution,
                     IOTools.val2str(st.qvalue),
                     segments_per_label[label],
                     workspaces_per_label[label],
                     ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.items():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(data,
                                     bins=numpy.arange(0, max(data), 100))

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")

        if options.hardcopy:
            plt.savefig(os.path.expanduser(
                options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(
            segments_per_workspace,
            bins=numpy.arange(0, max(segments_per_workspace), 1))
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")

        if options.hardcopy:
            plt.savefig(os.path.expanduser(
                options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(list(range(0, len(labels))),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(list(range(0, len(labels))), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')

        if options.hardcopy:
            plt.savefig(os.path.expanduser(
                options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(list(range(0, len(labels))),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(list(range(0, len(labels))), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        plt.xticks(list(range(0, len(labels))), labels)

        if options.hardcopy:
            plt.savefig(os.path.expanduser(
                options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.stop()
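# Usage sketch (hypothetical file names; the script name comes from the
# version string above). The proximity and auc tables are written via
# E.openOutputFile, not stdout:
#
#     python annotator_distance.py --workspace-bed-file=workspace.gff \
#         --segments=segments.gtf --counter=closest-distance \
#         --analysis=proximity --num-samples=100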
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--input-format", dest="input_format",
                      type="choice", choices=("bam", ),
                      help="input file format [default=%default].")

    parser.add_option("-w", "--window-size", dest="window_size",
                      type="int",
                      help="window size [default=%default].")

    parser.add_option("-c", "--control-filename", dest="control_filename",
                      type="string",
                      help="filename of input/control data in bam format "
                      "[default=%default].")

    parser.add_option("-t", "--threads", dest="threads", type="int",
                      help="number of threads to use [default=%default].")

    parser.add_option("-q", "--fdr-threshold", dest="fdr_threshold",
                      type="float",
                      help="fdr threshold [default=%default].")

    parser.add_option("-z", "--spp-z-threshold", dest="z_threshold",
                      type="float",
                      help="z threshold [default=%default].")

    parser.add_option("--bin", dest="bin", type="int",
                      help="bin tags within the specified number of "
                      "basepairs to speed up calculation; increasing bin "
                      "size decreases the accuracy of the determined "
                      "parameters [default=%default]")

    parser.add_option("--spp-srange-min", dest="srange_min", type="float",
                      help="srange gives the possible range for the size "
                      "of the protected region; srange should be higher "
                      "than tag length; making the upper boundary too "
                      "high will increase calculation time [%default]")

    parser.add_option("--spp-srange-max", dest="srange_max", type="float",
                      help="srange gives the possible range for the size "
                      "of the protected region; srange should be higher "
                      "than tag length; making the upper boundary too "
                      "high will increase calculation time [%default]")

    parser.set_defaults(
        input_format="bam",
        threads=1,
        fdr_threshold=0.05,
        window_size=1000,
        offset=125,
        srange_min=50,
        srange_max=500,
        bin=5,
        z_threshold=3,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please specify a filename with sample data and an output "
            "file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # load spp
    R.library('spp')
    R.library('snow')

    # read data
    E.info("reading data")
    R('''chip.data <- read.bam.tags('%s')''' % filename_sample)
    R('''input.data <- read.bam.tags('%s')''' % filename_control)
    R('''cluster = makeCluster( %i )''' % (options.threads))

    E.info("computing binding characteristics")

    # get binding info from cross-correlation profile

    # srange gives the possible range for the size of the protected
    # region; srange should be higher than tag length; making the upper
    # boundary too high will increase calculation time

    # bin - bin tags within the specified number of basepairs to speed
    # up calculation; increasing bin size decreases the accuracy of
    # the determined parameters
    srange_min, srange_max = options.srange_min, options.srange_max
    bin = options.bin
    R('''binding.characteristics <- get.binding.characteristics(chip.data,
    srange=c(%(srange_min)i,%(srange_max)i),
    bin=%(bin)s,
    cluster=cluster);''' % locals())

    # print out binding peak separation distance
    options.stdout.write(
        "shift\t%i\n" % R('''binding.characteristics$peak$x''')[0])

    ##################################################
    # plot cross-correlation profile
    E.info("plot cross correlation profile")
    R('''pdf(file="%s.crosscorrelation.pdf",width=5,height=5)''' %
      filename_output)
    R('''par(mar = c(3.5,3.5,1.0,0.5), mgp = c(2,0.65,0), cex = 0.8);''')
    R('''plot(binding.characteristics$cross.correlation,
    type='l',
    xlab="strand shift",
    ylab="cross-correlation");''')
    R('''abline(v=binding.characteristics$peak$x,lty=2,col=2)''')
    R('''dev.off();''')

    E.info("selecting informative tags based on the binding "
           "characteristics")

    # select informative tags based on the binding characteristics
    R('''chip.data <- select.informative.tags(
    chip.data,binding.characteristics);''')
    R('''input.data <- select.informative.tags(
    input.data,binding.characteristics);''')

    E.info("outputting broad peaks")
    window_size, z_threshold = options.window_size, options.z_threshold
    R('''broad.clusters <- get.broad.enrichment.clusters(chip.data,input.data,
    window.size=%(window_size)i,
    z.thr=%(z_threshold)f,
    tag.shift=round(binding.characteristics$peak$x/2))''' % locals())

    # write out in broadPeak format
    R('''write.broadpeak.info(broad.clusters,"%s.broadpeak.txt")''' %
      filename_output)

    # binding detection parameters: desired FDR (1%). Alternatively, an
    # E-value can be supplied to the method calls below instead of the
    # fdr parameter. The binding.characteristics contains the optimized
    # half-size for the binding detection window.
    R('''detection.window.halfsize <- binding.characteristics$whs;''')

    # determine binding positions using the wtd method
    E.info("determining binding positions using wtd method")
    fdr = options.fdr_threshold
    R('''bp <- find.binding.positions(
    signal.data=chip.data,control.data=input.data,
    fdr=%(fdr)f,whs=detection.window.halfsize,cluster=cluster)''' % locals())

    options.stdout.write(
        "detected_peaks\t%i\n" %
        R('''sum(unlist(lapply(bp$npl,function(d) length(d$x))))''')[0])

    # output detected binding positions
    R('''output.binding.results(bp,"%s.summit.txt");''' % filename_output)

    R('''bp <- add.broad.peak.regions(chip.data,input.data,bp,
    window.size=%(window_size)i,z.thr=%(z_threshold)f)''' % locals())

    # output using narrowPeak format
    R('''write.narrowpeak.binding(bp,"%s.narrowpeak.txt")''' %
      filename_output)

    # write footer and output benchmark information.
    E.stop()
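# Usage sketch (hypothetical file names and script name). The two
# positional arguments are the ChIP sample and an output file stem; the
# input/control data is given via --control-filename:
#
#     python runSPP.py --control-filename=input.bam --threads=4 \
#         chip.bam chip_spp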
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('reconcile', ),
                      help="method to apply [default=%default].")

    parser.add_option("-c", "--chop-identifier", dest="chop",
                      action="store_true",
                      help="whether or not to trim last character of the "
                      "sequence name. For example sometimes ids in the "
                      "first file in the pair will end with \\1 and the "
                      "second with \\2. If --chop-identifier is not "
                      "specified then the results will be wrong "
                      "[default=%default].")

    parser.add_option("-u", "--unpaired", dest="unpaired",
                      action="store_true",
                      help="whether or not to write out unpaired reads "
                      "to a separate file")

    parser.add_option("--id-pattern-1", dest="id_pattern_1",
                      help="If specified will use the first group from "
                      "the pattern to determine the ID for the first "
                      "read", default=None)

    parser.add_option("--id-pattern-2", dest="id_pattern_2",
                      help="As above but for read 2", default=None)

    parser.add_option("-o", "--output-filename-pattern",
                      dest="output_pattern", type="string",
                      help="pattern for output files [default=%default].")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        output_pattern="%s.fastq.gz",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()

    if options.id_pattern_1:
        id1_getter = PatternGetter(options.id_pattern_1)
    else:
        id1_getter = plain_getter

    if options.id_pattern_2:
        id2_getter = PatternGetter(options.id_pattern_2)
    else:
        id2_getter = plain_getter

    if options.method == "reconcile":

        # IMS: switching to not store the second set of read names and
        # only use it lazily. Since generators don't have a size, must
        # keep track separately.
        id_lengths = {fn1: 0, fn2: 0}

        def getIds(infile, id_getter=plain_getter):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                # decide if to chop read number off
                id_lengths[infile.name] += 1
                if options.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile, infile, take, unpaired_file=None,
                  id_getter=plain_getter):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                if options.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = IOTools.open_file(fn1)
        ids1 = set(getIds(inf1, id1_getter))

        E.info("reading second in pair")
        inf2 = IOTools.open_file(fn2)
        # IMS: No longer kept as a set, but lazily evaluated into the
        # intersection. This leads to a large memory saving for large
        # inf2, particularly if inf1 is small.
        ids2 = getIds(inf2, id2_getter)

        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (id_lengths[fn1], id_lengths[fn2], len(take)))

        if options.unpaired:
            unpaired_filename = IOTools.open_file(
                options.output_pattern % "unpaired", "w")
        else:
            unpaired_filename = None

        with IOTools.open_file(options.output_pattern % "1", "w") as outf:
            inf = IOTools.open_file(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_filename, id1_getter)

        with IOTools.open_file(options.output_pattern % "2", "w") as outf:
            inf = IOTools.open_file(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_filename, id2_getter)

        if options.unpaired:
            unpaired_filename.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
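# ``plain_getter`` and ``PatternGetter`` are referenced above but defined
# elsewhere in the module. A minimal sketch consistent with that usage
# (an assumption, not necessarily the project's implementation):

import re


def plain_getter(name):
    # use the fastq title verbatim as the read id
    return name


class PatternGetter:

    def __init__(self, pattern):
        self.pattern = re.compile(pattern)

    def __call__(self, name):
        # use the first captured group of the pattern as the read id
        return self.pattern.search(name).groups()[0]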
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--dest", dest="destination", type="string",
                      help="destination directory.")

    parser.add_option("-n", "--name", "--set-name", dest="name",
                      type="string",
                      help="name of this pipeline. 'pipeline_' will be "
                      "prefixed.")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="overwrite existing files.")

    parser.add_option("-t", "--pipeline-type", dest="pipeline_type",
                      type="choice", choices=("full", "minimal"),
                      help="type of pipeline to output. "
                      "full=a complete pipeline for the CGAT environment, "
                      "minimal=a minimal pipeline "
                      "[%default]")

    parser.set_defaults(
        destination=".",
        name=None,
        force=False,
        pipeline_type="full",
    )

    (options, args) = E.start(parser)

    if not options.name:
        raise ValueError("please provide a pipeline name")

    destination_dir = os.path.abspath(options.destination)
    reportdir = os.path.join(destination_dir, "src", "pipeline_docs",
                             "pipeline_%s" % options.name)
    confdir = os.path.join(destination_dir, "src",
                           "pipeline_%s" % (options.name))

    # create directories
    for d in ("", "src", "work", "src/pipeline_docs",
              "src/pipeline_%s" % options.name,
              reportdir,
              "%s/_templates" % reportdir,
              "%s/pipeline" % reportdir,
              "%s/trackers" % reportdir):
        dd = os.path.join(destination_dir, d)
        if not os.path.exists(dd):
            os.makedirs(dd)

    # copy files
    # replaces all instances of template with options.name within
    # filenames and inside files.
    rx_file = re.compile("template")
    rx_type = re.compile("_%s" % options.pipeline_type)
    rx_template = re.compile("@template@")
    rx_reportdir = re.compile("@reportdir@")

    srcdir = os.path.dirname(__file__)

    def copy(src, dst, name):

        # remove "template" and the pipeline type from file/directory
        # names.
        fn_dest = os.path.join(destination_dir, dst,
                               rx_type.sub("", rx_file.sub(name, src)))

        fn_src = os.path.join(srcdir, "pipeline_template_data", src)

        E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
                (fn_src, fn_dest, src, dst))

        if os.path.exists(fn_dest) and not options.force:
            raise OSError("file %s already exists - not overwriting." %
                          fn_dest)

        if fn_src.endswith(".png"):
            shutil.copyfile(fn_src, fn_dest)
        else:
            with IOTools.open_file(fn_dest, "w") as outfile:
                with IOTools.open_file(fn_src) as infile:
                    for line in infile:
                        outfile.write(rx_reportdir.sub(
                            reportdir, rx_template.sub(name, line)))

    def copytree(src, dst, name):

        fn_dest = os.path.join(destination_dir, dst,
                               rx_file.sub(name, src))
        fn_src = os.path.join(srcdir, "pipeline_template_data", src)

        if os.path.exists(fn_dest) and not options.force:
            raise OSError("file %s already exists - not overwriting." %
                          fn_dest)

        shutil.copytree(fn_src, fn_dest)

    for f in ("pipeline.yml", ):
        copy(f, 'src/pipeline_%s' % options.name, name=options.name)

    # copy the script
    copy("pipeline_template_%s.py" % options.pipeline_type,
         'src',
         name=options.name)

    # create links
    for src, dest in (("pipeline.yml", "pipeline.yml"), ):
        d = os.path.join(destination_dir, "work", dest)
        if os.path.exists(d) and options.force:
            os.unlink(d)
        os.symlink(os.path.join(confdir, src), d)

    for f in ("cgat_logo.png", ):
        copy(f, "%s/_templates" % reportdir, name=options.name)

    for f in ("themes", ):
        copytree(f, "src/pipeline_docs", name=options.name)

    for f in ("contents.rst", "pipeline.rst", "__init__.py"):
        copy(f, reportdir, name=options.name)

    for f in ("Dummy.rst", "Methods.rst"):
        copy(f, "%s/pipeline" % reportdir, name=options.name)

    for f in ("TemplateReport.py", ):
        copy(f, "%s/trackers" % reportdir, name=options.name)

    absdest = os.path.abspath(destination_dir)

    name = options.name

    print("""
Welcome to your new %(name)s CGAT pipeline.

All files have been successfully copied to `%(destination_dir)s`. In
order to start the pipeline, go to `%(destination_dir)s/work`

   cd %(destination_dir)s/work

You can start the pipeline by typing:

   cgatflow %(name)s -v 5 -p 5 make full

The source code for the pipeline is in %(destination_dir)s/src.
""" % locals())

    E.stop()
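# Usage sketch (hypothetical name and destination; the script name is an
# assumption):
#
#     python pipeline_quickstart.py --name=mapping --dest=/tmp/projects \
#         --pipeline-type=minimal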
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--pattern-identifier", dest="pattern",
                      type="string",
                      help="jobs matching `pattern` in their job "
                      "description will be killed [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="do dry run, do not kill [default=%default].")

    parser.set_defaults(
        pattern=None,
        dry_run=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    output = io.StringIO(
        subprocess.Popen(["qstat", "-xml"],
                         stdout=subprocess.PIPE).communicate()[0].decode())

    tree = xml.etree.ElementTree.ElementTree(file=output)

    ntested = 0
    to_kill = set()

    if options.pattern:
        pattern = re.compile(options.pattern)
    else:
        pattern = None

    for x in tree.iter("job_list"):
        ntested += 1
        id = x.find("JB_job_number").text
        name = x.find("JB_name").text
        if pattern and pattern.search(name):
            to_kill.add(id)

    nkilled = len(to_kill)
    if not options.dry_run:
        p = subprocess.Popen(["qdel", ",".join(to_kill)],
                             stdout=subprocess.PIPE)
        stdout, stderr = p.communicate()

    E.info("ntested=%i, nkilled=%i" % (ntested, nkilled))

    # write footer and output benchmark information.
    E.stop()
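# Usage sketch (assumes a Grid Engine environment with qstat/qdel on the
# PATH; the script name is an assumption):
#
#     python qkill.py --pattern="run_.*" --dry-run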
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff_compare.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--output-full", dest="write_full",
                      action="store_true",
                      help="write full gff entries.")
    parser.add_option("-e", "--output-matched-exons",
                      dest="write_matched_exons", action="store_true",
                      help="write matched exons.")
    parser.add_option("-o", "--output-missed-exons",
                      dest="write_missed_exons", action="store_true",
                      help="write missed exons.")
    parser.add_option("-g", "--output-missed-genes",
                      dest="write_missed_genes", action="store_true",
                      help="write missed genes.")
    parser.add_option("-r", "--regex-reference", dest="regex_reference",
                      type="string",
                      help="regular expression mapping exon to transcript "
                      "in reference.")
    parser.add_option("-t", "--regex-target", dest="regex_target",
                      type="string",
                      help="regular expression mapping exon to transcript "
                      "in target.")
    parser.add_option("--no-nucleotides", dest="do_nucleotides",
                      action="store_false",
                      help="skip nucleotide benchmark.")
    parser.add_option("--no-exons", dest="do_exons", action="store_false",
                      help="skip exon benchmark.")
    parser.add_option("--no-genes", dest="do_genes", action="store_false",
                      help="skip gene benchmark.")
    parser.add_option("--output-filename-pattern", dest="outfile_pattern",
                      type="string",
                      help="output filename pattern for extra info "
                      "(%s will be substituted with reference,target).")

    parser.set_defaults(
        remove_redundancy=False,
        max_exon_slippage=9,
        write_missed_exons=False,
        write_matched_exons=False,
        write_missed_genes=False,
        write_wrong_exons=False,
        write_wrong_genes=False,
        do_nucleotides=True,
        do_exons=True,
        do_genes=True,
        regex_reference=None,
        regex_target=None,
        outfile_pattern="%s.info",
    )

    (options, args) = E.start(parser)

    if len(args) != 2:
        print(USAGE)
        print("two arguments required")
        sys.exit(1)

    input_filename_target, input_filename_reference = args

    if options.loglevel >= 1:
        print("# target entries from %s" % input_filename_target)
        print("# reading target entries ...", end=' ')
        sys.stdout.flush()

    gff_targets = GTF.readFromFile(open(input_filename_target, "r"))

    if options.loglevel >= 1:
        print("finished: %i" % (len(gff_targets)))
        sys.stdout.flush()

    if options.loglevel >= 1:
        print("# reference entries from %s" % input_filename_reference)
        print("# reading reference entries ...", end=' ')
        sys.stdout.flush()

    gff_references = GTF.readFromFile(open(input_filename_reference, "r"))

    if options.loglevel >= 1:
        print("finished: %i" % (len(gff_references)))
        sys.stdout.flush()

    if options.remove_redundancy:
        gff_targets = GTF.CombineOverlaps(gff_targets)
        gff_references = GTF.CombineOverlaps(gff_references)

        if options.loglevel >= 1:
            print("# after filtering: targets=%i, references=%i" %
                  (len(gff_targets), len(gff_references)))

    ##########################################################################
    # sort exons
    if options.loglevel >= 1:
        print("# sorting exons ...", end=' ')
        sys.stdout.flush()

    gff_targets.sort(key=lambda x: (x.mName, x.strand, x.start, x.end))
    gff_references.sort(key=lambda x: (x.mName, x.strand, x.start, x.end))

    ntargets = len(gff_targets)
    nreferences = len(gff_references)

    if options.loglevel >= 1:
        print("finished")
        sys.stdout.flush()

    ##########################################################################
    # get nucleotide level accuracy
    # process each fragment separately
    if options.do_nucleotides:

        print("############################################################")

        headers = ("contig", "strand", "tp", "fp", "tn", "fn",
                   "sp", "sn", "cc")
        print("\t".join(headers))

        first_r, first_t = 0, 0
        r, t = 0, 0

        ttp, tfp, ttn, tfn = 0, 0, 0, 0

        # this only works if all contigs in the reference are present
        # in the target.
        while r < nreferences and t < ntargets:

            this_name = gff_references[r].mName
            this_strand = gff_references[r].strand

            # get all in references
            while r < nreferences and \
                    gff_references[r].mName == this_name and \
                    gff_references[r].strand == this_strand:
                r += 1

            # skip over extra contigs in target
            while t < ntargets and \
                    (gff_targets[t].mName != this_name or
                     gff_targets[t].strand != this_strand):
                t += 1
            first_t = t

            # get all in targets
            while t < ntargets and \
                    gff_targets[t].mName == this_name and \
                    gff_targets[t].strand == this_strand:
                t += 1

            tp, fp, tn, fn = AnalyseOverlaps(gff_references[first_r:r],
                                             gff_targets[first_t:t])

            spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
            cc = CalculateCorrelationCoefficient(tp, fp, tn, fn)
            print("%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f" %
                  (this_name, this_strand, tp, fp, tn, fn, spec, sens, cc))

            ttp += tp
            tfp += fp
            ttn += tn
            tfn += fn
            first_r, first_t = r, t

        spec, sens = CalculateSpecificitySensitivity(ttp, tfp, ttn, tfn)
        cc = CalculateCorrelationCoefficient(ttp, tfp, ttn, tfn)
        print("%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f" %
              ("all", "all", ttp, tfp, ttn, tfn, spec, sens, cc))

        sys.stdout.flush()

    ##########################################################################
    if options.do_exons or options.do_genes:

        print("############################################################")

        headers = ("category", "contig", "strand", "tp", "fp", "tn", "fn",
                   "sp", "sn", "cc", "me", "we", "pme", "pwe")
        print("\t".join(headers))

        r, t = 0, 0
        next_r, next_t = r, t

        # strict false positives/negatives
        tp, fp, tn, fn = 0, 0, 0, 0
        ttp, tfp, ttn, tfn = 0, 0, 0, 0

        # partial false positives/negatives
        ptp, pfp, ptn, pfn = 0, 0, 0, 0
        tptp, tpfp, tptn, tpfn = 0, 0, 0, 0

        # missed and wrong exons
        missed_exons, wrong_exons = 0, 0
        tmissed_exons, twrong_exons = 0, 0

        # Flag set, if partial overlap in previous pair
        last_partial_overlap = False
        # Flag set, if partial overlap and reference was last increased
        last_increased_ref = False

        while r < nreferences and t < ntargets:

            this_name = gff_references[r].mName
            this_strand = gff_references[r].strand

            # get overlap segments
            if next_r == r:
                ref_overlaps, next_r, ref_start, ref_end = \
                    GetFirstOverlaps(gff_references, r)
            if next_t == t:
                target_overlaps, next_t, target_start, target_end = \
                    GetFirstOverlaps(gff_targets, t)

            if options.loglevel >= 3:
                print("########################################################")
                for x in ref_overlaps:
                    print("#", str(x))
                for x in target_overlaps:
                    print("#", str(x))

            do_summary = False

            # check strand switch in reference
            if next_r < nreferences and \
                    (this_name != gff_references[next_r].mName or
                     this_strand != gff_references[next_r].strand):
                if options.loglevel >= 3:
                    print("# target advance")

                do_summary = True
                last_increased_ref = False
                last_partial_overlap = False

                # advance in target until next name is found
                next_name = gff_references[next_r].mName
                next_strand = gff_references[next_r].strand
                while next_t < ntargets and \
                        (next_name != gff_targets[next_t].mName or
                         next_strand != gff_targets[next_t].strand):
                    fp += 1
                    pfp += 1
                    target_overlaps, next_t, target_start, target_end = \
                        GetFirstOverlaps(gff_targets, next_t)

                for x in gff_targets[t:next_t]:
                    x.mStatus = "extra"
                for x in gff_references[r:next_r]:
                    x.mStatus = "extra"

                r, t = next_r, next_t

            # check strand switch in target
            elif next_t < ntargets and \
                    (this_name != gff_targets[next_t].mName or
                     this_strand != gff_targets[next_t].strand):
                # advance in reference until next name is found
                if options.loglevel >= 3:
                    print("# reference advance")

                do_summary = True
                last_increased_ref = False
                last_partial_overlap = False

                next_name = gff_targets[next_t].mName
                next_strand = gff_targets[next_t].strand

                while next_r < nreferences and \
                        (next_name != gff_references[next_r].mName or
                         next_strand != gff_references[next_r].strand):
                    fn += 1
                    pfn += 1
                    reference_overlaps, next_r, references_start, references_end = \
                        GetFirstOverlaps(gff_references, next_r)

                for x in gff_targets[t:next_t]:
                    x.mStatus = "extra"
                for x in gff_references[r:next_r]:
                    x.mStatus = "extra"

                r, t = next_r, next_t

            # otherwise
            else:

                ref_status, target_status = None, None

                if options.loglevel >= 3:
                    print("# same chromosome")

                # overlap between segments
                if min(ref_end, target_end) - \
                        max(ref_start, target_start) > 0:

                    # clear flags
                    last_increased_ref = False
                    last_partial_overlap = False

                    found = False
                    for rr in ref_overlaps:
                        xfound = False
                        for tt in target_overlaps:
                            if GTF.Identity(
                                    rr, tt,
                                    max_slippage=options.max_exon_slippage):
                                xfound = True
                                break
                        if xfound:
                            found = True
                            break

                    if found:
                        ref_status = "match"
                        target_status = "match"
                        tp += 1
                        ptp += 1
                        if options.write_matched_exons:
                            print("############# matching exons ###########################")
                            for x in ref_overlaps:
                                print("#", str(x))
                            for x in target_overlaps:
                                print("#", str(x))
                    else:
                        fn += 1
                        # check for one-sided matches
                        for rr in ref_overlaps:
                            xfound = False
                            for tt in target_overlaps:
                                if GTF.HalfIdentity(
                                        rr, tt,
                                        max_slippage=options.max_exon_slippage):
                                    xfound = True
                                    break
                            if xfound:
                                found = True
                                break

                        if found:
                            ptp += 1
                            code = "partial"
                            ref_status = "partial"
                            target_status = "partial"
                        else:
                            pfn += 1
                            code = "complete"
                            ref_status = "mismatch"
                            target_status = "mismatch"

                        if options.write_missed_exons:
                            print("############# %s non-overlapping exons ###########################" % code)
                            for x in ref_overlaps:
                                print("#", str(x))
                            for x in target_overlaps:
                                print("#", str(x))

                    ###########################################################
                    # r, t = next_r, next_t
                    if ref_end == target_end:
                        r, t = next_r, next_t
                    elif ref_end < target_end:
                        r = next_r
                        last_increased_ref = True
                        last_partial_overlap = True
                    else:
                        t = next_t
                        last_increased_ref = False
                        last_partial_overlap = True

                # non-overlap between segments
                else:

                    if ref_end < target_start:

                        # for non-overlap, check whether there was partial
                        # overlap before and reference was not increased.
                        # if there was, just increment reference, but do
                        # not count.
                        if not (last_partial_overlap and
                                not last_increased_ref):

                            if options.write_missed_exons:
                                print("############# missed exon ###########################")
                                for x in ref_overlaps:
                                    print("#", str(x))
                            missed_exons += 1
                            fn += 1
                            pfn += 1
                            ref_status = "extra"

                        r = next_r

                    else:

                        # for non-overlap, check whether there was partial
                        # overlap before and target was not increased.
                        # if there was, just increment target, but do not
                        # count.
                        if not (last_partial_overlap and
                                last_increased_ref):
                            if options.write_wrong_exons:
                                print("############# wrong exon ###########################")
                                for x in target_overlaps:
                                    print("#", str(x))

                            wrong_exons += 1
                            fp += 1
                            pfp += 1
                            target_status = "extra"

                        t = next_t

                    last_partial_overlap = False

                if options.loglevel >= 3:
                    print("# ref_status=%s, target_status=%s" %
                          (ref_status, target_status))

                if ref_status:
                    for rr in ref_overlaps:
                        rr.mStatus = ref_status

                    if ref_status in ("match", "partial") and \
                            options.do_genes:
                        for rr in ref_overlaps:
                            rr.mMatches = target_overlaps

                if target_status:
                    for tt in target_overlaps:
                        tt.mStatus = target_status

                    if target_status in ("match", "partial") and \
                            options.do_genes:
                        for tt in target_overlaps:
                            tt.mMatches = ref_overlaps

            if do_summary or r >= nreferences or t >= ntargets:
                ttp += tp
                tfp += fp
                ttn += tn
                tfn += fn

                tptp += ptp
                tpfp += pfp
                tptn += ptn
                tpfn += pfn

                tmissed_exons += missed_exons
                twrong_exons += wrong_exons

                if tp + fn != 0:
                    pmissed_exons = "%5.2f" % (float(missed_exons) /
                                               (tp + fn))
                else:
                    pmissed_exons = "0"

                if tp + fp != 0:
                    pwrong_exons = "%5.2f" % (float(wrong_exons) /
                                              (tp + fp))
                else:
                    pwrong_exons = "na"

                spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn)
                cc = (spec + sens) / 2.0

                print("full\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%s\t%s" %
                      (this_name, this_strand,
                       tp, fp, tn, fn,
                       spec, sens, cc,
                       missed_exons, wrong_exons,
                       pmissed_exons, pwrong_exons))

                spec, sens = CalculateSpecificitySensitivity(
                    ptp, pfp, ptn, pfn)
                cc = (spec + sens) / 2.0

                print("half\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%s\t%s" %
                      (this_name, this_strand,
                       ptp, pfp, ptn, pfn,
                       spec, sens, cc,
                       missed_exons, wrong_exons,
                       pmissed_exons, pwrong_exons))

                tp, fp, tn, fn = 0, 0, 0, 0
                ptp, pfp, ptn, pfn = 0, 0, 0, 0
                missed_exons, wrong_exons = 0, 0

        if t < ntargets:
            for x in gff_targets[t:ntargets]:
                x.mStatus = "extra"
        if r < nreferences:
            for x in gff_references[r:nreferences]:
                x.mStatus = "extra"

        spec, sens = CalculateSpecificitySensitivity(ttp, tfp, ttn, tfn)
        cc = (spec + sens) / 2.0
        print("full\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" %
              ("all", "all",
               ttp, tfp, ttn, tfn,
               spec, sens, cc,
               tmissed_exons, twrong_exons,
               float(tmissed_exons) / (ttp + tfn),
               float(twrong_exons) / (ttp + tfp)))

        spec, sens = CalculateSpecificitySensitivity(tptp, tpfp, tptn, tpfn)
        cc = (spec + sens) / 2.0
        print("half\t%s\t%s\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" %
              ("all", "all",
               tptp, tpfp, tptn, tpfn,
               spec, sens, cc,
               tmissed_exons, twrong_exons,
               float(tmissed_exons) / (ttp + tfn),
               float(twrong_exons) / (ttp + tfp)))

    if options.do_genes and \
            options.regex_reference and \
            options.regex_target:

        print("###############################################################")

        out_options = []
        if options.write_missed_genes:
            out_options.append("missed")

        if options.loglevel >= 2:
            print("# counting matches for reference.")
            sys.stdout.flush()

        (ref_total, ref_match, ref_partial, ref_extra) = \
            CountMatchesPerGene(gff_references,
                                re.compile(options.regex_reference),
                                re.compile(options.regex_target),
                                write=out_options,
                                outfile=open(options.outfile_pattern %
                                             "reference", "w"))

        if options.loglevel >= 2:
            print("# counting matches for target.")
            sys.stdout.flush()

        (target_total, target_match, target_partial, target_extra) = \
            CountMatchesPerGene(gff_targets,
                                re.compile(options.regex_target),
                                re.compile(options.regex_reference),
                                write=out_options,
                                outfile=open(options.outfile_pattern %
                                             "target", "w"))

        if options.loglevel >= 1:
            print("# reference:
genes=%6i, matches=%6i, partial=%6i, extra=%6i" % (ref_total, ref_match, ref_partial, ref_extra)) print( "# target : genes=%6i, matches=%6i, partial=%6i, extra=%6i" % (target_total, target_match, target_partial, target_extra)) headers = ("category", "tp", "fp", "tn", "fn", "sp", "sn", "cc", "mg", "wg", "mg", "wg") print("\t".join(headers)) tp = ref_match fp = target_extra tn = 0 fn = ref_total - ref_match wrong_genes = target_extra missed_genes = ref_extra spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn) cc = (spec + sens) / 2.0 if tp + fp == 0: fp = nreferences print( "full\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" % (tp, fp, tn, fn, spec, sens, cc, missed_genes, wrong_genes, float(missed_genes) / (tp + fn), float(wrong_genes) / (tp + fp))) tp = ref_match + ref_partial fp = target_extra tn = 0 fn = ref_total - ref_match - ref_partial wrong_genes = target_extra missed_genes = ref_extra spec, sens = CalculateSpecificitySensitivity(tp, fp, tn, fn) cc = (spec + sens) / 2.0 print( "half\t%i\t%i\t%i\t%i\t%5.2f\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f" % (tp, fp, tn, fn, spec, sens, cc, missed_genes, wrong_genes, float(missed_genes) / (tp + fn), float(wrong_genes) / (tp + fp))) E.stop()
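# CalculateSpecificitySensitivity() and CalculateCorrelationCoefficient()
# are defined elsewhere in this module. A minimal sketch under the
# standard definitions -- specificity = TP/(TP+FP), sensitivity =
# TP/(TP+FN), and the Matthews correlation coefficient; returning
# fractions rather than percentages is an assumption here:
import math


def CalculateSpecificitySensitivity(tp, fp, tn, fn):
    """return (specificity, sensitivity); 0.0 on an empty denominator."""
    spec = float(tp) / (tp + fp) if tp + fp else 0.0
    sens = float(tp) / (tp + fn) if tp + fn else 0.0
    return spec, sens


def CalculateCorrelationCoefficient(tp, fp, tn, fn):
    """return the Matthews correlation coefficient of the counts."""
    denominator = math.sqrt(
        float(tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
    if denominator == 0:
        return 0.0
    return (float(tp) * tn - float(fp) * fn) / denominator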
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-e", "--input-bed-file", dest="input_bed_file", type="string", help="input file with intervals. Tab-delimited file of intervals " "in bed format to restrict analysis to. [%default]") parser.add_option( "-m", "--merge-intervals", dest="merge_intervals", action="store_true", help="merge intervals in bed file. Useful if you have a site bed-file " "[%default]") parser.add_option("-f", "--reference-fasta-file", dest="reference_fasta_file", help="reference genomic sequence in fasta format. " "[%default]") parser.add_option( "-c", "--barcode-fasta-file", dest="barcode_fasta_file", help="barcode sequence in fasta format. Variable positions " "should be marked by N " "[%default]") parser.set_defaults( reference_fasta_file=None, barcode_fasta_file=None, merge_intervals=False, input_bed_file=None, anchor=5, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if options.stdin != sys.stdin: bamfile = options.stdin.name elif args: if len(args) > 1: raise ValueError("multiple bam files provided in arguments") bamfile = args[0] else: bamfile = "-" if options.barcode_fasta_file: with pysam.FastxFile(options.barcode_fasta_file) as inf: barcode_sequence = next(inf).sequence else: barcode_sequence = None if not os.path.exists(options.reference_fasta_file): raise OSError("reference fasta file {} does not exist".format( options.reference_fasta_file)) if not os.path.exists(options.input_bed_file): raise OSError("input bed file {} does not exist".format( options.input_bed_file)) bed_in = pysam.TabixFile(options.input_bed_file) pysam_in = pysam.AlignmentFile(bamfile) anchor = options.anchor for region_idx, vals in enumerate( iterate_bed(bed_in, options.merge_intervals)): if region_idx > 0: raise NotImplementedError( "output for multiple regions not yet implemented") contig, region_start, region_end = vals upstream_anchors, downstream_anchors = [], [] counter = E.Counter() unaligned_fn = E.get_output_file( "unaligned_{}.fasta".format(region_idx)) with IOTools.open_file(unaligned_fn, "w") as outf: for read in pysam_in.fetch(contig, region_start, region_end): counter.overlapping_reads += 1 try: pairs = read.get_aligned_pairs(with_seq=True) except ValueError: counter.no_md_tag += 1 continue map_ref2read_pos = dict( (x[1], x[0]) for x in pairs if x[0] is not None) map_ref2ref_base = dict( (x[1], x[2]) for x in pairs if x[0] is not None) upstream_anchor = "".join( map_ref2ref_base.get(x, "") for x in range(region_start - anchor, region_start)) downstream_anchor = "".join( map_ref2ref_base.get(x, "") for x in range(region_end, region_end + anchor)) # check if at least one anchor is aligned upstream_matches = sum([x.isupper() for x in upstream_anchor]) downstream_matches = sum( [x.isupper() for x in downstream_anchor]) if upstream_matches < anchor and downstream_matches < anchor: counter.no_anchor += 1 continue seq = read.query_alignment_sequence # collect full length anchors upstream_anchor_start, upstream_anchor_end = region_start - anchor, region_start downstream_anchor_start, downstream_anchor_end = region_end, region_end + anchor if upstream_anchor_start in map_ref2read_pos and upstream_anchor_end in map_ref2read_pos: upstream_anchors.append( 
seq[map_ref2read_pos[upstream_anchor_start]: map_ref2read_pos[upstream_anchor_end]]) if downstream_anchor_start in map_ref2read_pos and downstream_anchor_end in map_ref2read_pos: downstream_anchors.append( seq[map_ref2read_pos[downstream_anchor_start]: map_ref2read_pos[downstream_anchor_end]]) # get region to align read_start = min( (map_ref2read_pos.get(x, len(seq)) for x in range(region_start - anchor, region_start))) if read_start == len(seq): read_start = 0 read_end = max( (map_ref2read_pos.get(x, 0) + 1 for x in range(region_end, region_end + anchor))) if read_end == 1: read_end = len(seq) counter.collected_reads += 1 outf.write(">{}/{}-{}\n{}\n".format(read.query_name, read_start, read_end, seq[read_start:read_end])) counter.downstream_anchors = len(downstream_anchors) counter.upstream_anchors = len(upstream_anchors) E.info(counter) if counter.overlapping_reads == 0: E.warn("no sequences overlapping region") continue if counter.downstream_anchors == 0 or counter.upstream_anchors == 0: E.warn("at least one anchor undefined") continue if counter.collected_reads == 1: E.warn("only single sequence, multiple aligment skipped") with IOTools.open_file(unaligned_fn) as inf: stdout = inf.read() else: # G-INS-i -> global alignment algorithm E.info("starting mafft multiple alignment") stdout = E.run( "mafft --globalpair --maxiterate 100 --quiet --op 2 --ep 0.5 {}" .format(unaligned_fn), return_stdout=True) aligned_fn = E.get_output_file("aligned_{}.fasta".format(region_idx)) with IOTools.open_file(aligned_fn, "w") as outf: outf.write(stdout) mali = stdout.splitlines() identifiers = [mali[x] for x in range(0, len(mali), 2)] sequences = [mali[x].upper() for x in range(1, len(mali), 2)] consensus = get_consensus(sequences) E.info("after alignment: consensus={}".format(consensus)) # gap filtering -> remove highly gappy columns consensus = get_consensus(sequences, min_gap_proportion=0.9) E.info("after anchor trimming: consensus={}".format(consensus)) take = [idx for idx, x in enumerate(consensus) if x != "-"] sequences = ["".join([s[x] for x in take]) for s in sequences] consensus = get_consensus(sequences, min_gap_proportion=0.9) E.info("after gap filtering: consensus={}".format(consensus)) # get anchor consensus and chop it off consensus = get_consensus(sequences, ignore_gaps=True) upstream_anchor = get_anchor_consensus(upstream_anchors) downstream_anchor = get_anchor_consensus(downstream_anchors) upstream_anchor_start = consensus.find(upstream_anchor) downstream_anchor_start = consensus.rfind(downstream_anchor) E.info( "anchor consensus (no gaps)={}, upstream={}, downstream={}, upstream_idx={}, downstream_idx={}" .format(consensus, upstream_anchor, downstream_anchor, upstream_anchor_start, downstream_anchor_start)) if upstream_anchor_start < 0 or downstream_anchor_start < 0: E.warn("can't locate anchor, no output produced") continue upstream_anchor_end = upstream_anchor_start + len(upstream_anchor) if upstream_anchor_end >= downstream_anchor_start: E.warn("anchor not in correct order, no output produced") continue sequences = [ x[upstream_anchor_end:downstream_anchor_start] for x in sequences ] consensus = get_consensus(sequences) E.info("after anchor trimming: consensus={}".format(consensus)) truncated_fn = E.get_output_file( "aligned_truncated_{}.fasta".format(region_idx)) with IOTools.open_file(truncated_fn, "w") as outf: outf.write("\n".join("{}\n{}\n".format(x, y) for x, y in zip(identifiers, sequences))) positions = list(zip(*sequences)) bases = ["A", "C", "G", "T"] df = 
pandas.DataFrame([collections.Counter(x) for x in positions]).fillna(0) for missing_base in [x for x in bases if x not in df.columns]: df[missing_base] = 0 df["gapped_depth"] = df.sum(axis=1) df["depth"] = df[bases].sum(axis=1) df["consensus"] = df[bases].idxmax(axis=1) df["consensus_counts"] = df.lookup(df.index, df.consensus) df["consensus_support"] = df.consensus_counts / df.depth df["offconsensus_counts"] = df.depth - df.consensus_counts df.loc[df.consensus_counts == 0, "consensus"] = "N" df["region_id"] = region_idx # replace "gap" consensus positions with + character alignment = global_align(re.sub("-", "+", consensus), barcode_sequence) E.info("alignment: consensus {}".format(alignment[0])) E.info("alignment: barcode {}".format(alignment[1])) barcode_idx = 0 deleted_barcode_bases = [] rows = [] for c, b in zip(*alignment): if c == "-": deleted_barcode_bases.append(barcode_idx) barcode_idx += 1 elif b == "N": rows.append((barcode_idx, "variable")) barcode_idx += 1 elif b == "-": rows.append(("", "insertion")) elif b == c: rows.append((barcode_idx, "fixed-match")) barcode_idx += 1 else: rows.append((barcode_idx, "fixed-mismatch")) barcode_idx += 1 alignment_df = pandas.DataFrame.from_records( rows, columns=["barcode_pos", "barcode_class"]) assert len(alignment_df) == len(df) df = pandas.concat([df, alignment_df], axis=1) with E.open_output_file("pileup") as outf: df.to_csv(outf, sep="\t", index=True, index_label="position") observed_barcode_sequence = "".join( df[df.barcode_class == "variable"].consensus) headers = df.consensus_support.describe().index eval_df = df.loc[df.barcode_class.isin( ("variable", "fixed-match", "fixed-mismatch")), ] median_consensus_depth = eval_df.consensus_counts.median() # zero stuff out if depth is low if median_consensus_depth <= 2: deleted_barcode_bases = [] outf = options.stdout # modules to recover partial bar-codes outf.write("\t".join( map(str, [ "barcode", "ndeleted_barcode_bases", "deleted_barcode_bases" ] + ["support_{}".format(x) for x in headers] + ["counts_{}".format(x) for x in headers] + ["offcounts_{}".format(x) for x in headers])) + "\n") outf.write("\t".join( map(str, [ observed_barcode_sequence, len(deleted_barcode_bases), ",".join( map(str, deleted_barcode_bases)) ] + eval_df.consensus_support.describe().tolist() + eval_df.consensus_counts.describe().tolist() + eval_df.offconsensus_counts.describe().tolist())) + "\n") E.stop()
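# get_consensus() and get_anchor_consensus() are provided elsewhere; the
# sketch below shows column-wise majority voting consistent with the calls
# above. The exact semantics of min_gap_proportion and ignore_gaps are
# assumptions: columns whose gap fraction reaches min_gap_proportion are
# reported as "-", and ignore_gaps drops gap characters before voting.
import collections


def get_consensus(sequences, min_gap_proportion=1.0, ignore_gaps=False):
    """return a majority-vote consensus of equal-length aligned sequences."""
    consensus = []
    for column in zip(*sequences):
        counts = collections.Counter(column)
        if counts["-"] / float(len(column)) >= min_gap_proportion:
            consensus.append("-")
            continue
        if ignore_gaps:
            del counts["-"]
        consensus.append(counts.most_common(1)[0][0])
    return "".join(consensus)


def get_anchor_consensus(anchors):
    """consensus of the collected anchor sequences (gaps ignored)."""
    # the anchors collected above are unaligned but of equal length
    return get_consensus(anchors, ignore_gaps=True)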
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--time", dest="timepoints", type="string", help="a comma-separated list of time points measured") parser.add_option("--replicates", dest="reps", type="string", help="a comma-separated list of replicate IDs") parser.add_option("--condition", dest="condition", type="string", help="experimental condition") parser.add_option("--resamples", dest="resamples", type="string", help="number of times to resample replicates to" " generate pseudo datasets") parser.add_option("--input-gtf", dest="gtf_file", type="string", help="reference gtf file") parser.add_option("--output-file-directory", dest="output_dir", type="string", help="directory to output" " resampled files to") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) try: infile = IOTools.open_file(argv[-1], "r") except IOError: infile = options.stdin data_frame = pd.read_table(infile, sep="\t", index_col=0, header=0) time_str = options.timepoints.split(",") time_points = [int(x) for x in time_str] replicates = options.reps.split(",") reps = int(options.resamples) its = [time_str, replicates] midx = pd.MultiIndex.from_product(its, names=['times', 'replicates']) TS.genResampleData(data_frame=data_frame, multiple_index=midx, replicates=reps, sample_reps=replicates, times=time_points, condition=options.condition, ref_gtf=options.gtf_file, out_dir=options.output_dir, seed=int(options.random_seed)) # Write footer and output benchmark information. E.stop()
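# TS.genResampleData() lives in a project module that is not shown here.
# A minimal sketch of the core idea -- drawing replicates with replacement
# at each time point to build a pseudo data set -- using only pandas and
# numpy. Column labels of the form "<condition>.<time>.<replicate>" are an
# assumption made for illustration only.
import numpy as np
import pandas as pd


def resample_replicates(df, times, replicates, condition, seed=0):
    """return one pseudo data set with replicates drawn with replacement."""
    rng = np.random.RandomState(seed)
    columns = {}
    for time in times:
        # pick len(replicates) replicate labels, with replacement
        choices = rng.choice(replicates, size=len(replicates), replace=True)
        for new_rep, old_rep in zip(replicates, choices):
            old = "%s.%s.%s" % (condition, time, old_rep)
            new = "%s.%s.%s" % (condition, time, new_rep)
            columns[new] = df[old]
    return pd.DataFrame(columns, index=df.index)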
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option( "--task", dest="task", type="choice", choices=["mafs", "penetrance", "detect_duplicates", "allele_diff"], help="task to perform") parser.add_option("--ped-file", dest="ped_file", type="string", help="plink format .ped file") parser.add_option("--map-file", dest="map_file", type="string", help="plink format .map file") parser.add_option("--freq-file", dest="mafs", type="string", help="text file containing populations minor " "allele frequencies of variants. One row per " "variant with ID MAF") parser.add_option("--groups-file", dest="group_file", type="string", help="file containing group labels for individuals " "in the provided ped file") parser.add_option("--ref-label", dest="ref_label", type="string", help="group label to be used as the reference case") parser.add_option("--test-label", dest="test_label", type="string", help="group label to be used as the test case") parser.add_option("--subset", dest="subset", type="choice", choices=["cases", "gender"], help="subset the " "data by either case/control or gender") parser.add_option("--take-last", dest="take", action="store_true", help="if use duplicates will take the last variant, " "default behaviour is to take the first") parser.add_option("--outfile-pattern", dest="out_pattern", type="string", help="outfile pattern to use for finding duplicates " "and triallelic variants") parser.add_option("--snp-set", dest="snp_subset", type="string", help="list of SNPs to include") # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv) parser.set_defaults(mafs=None, subset=None, take_last=False) if options.task == "mafs": mafs = gwas.countByVariantAllele(options.ped_file, options.map_file) mafs.to_csv(options.stdout, index_col=None, sep="\t") elif options.task == "penetrance": summary, pens = gwas.calcPenetrance(options.ped_file, options.map_file, subset=options.subset, mafs=options.mafs, snpset=options.snp_subset) pens.to_csv(options.stdout, sep="\t", index_label="SNP") summary.to_csv("/".join([os.getcwd(), "penetrance_summary.txt"]), sep="\t", index_label="SNP") elif options.task == "allele_diff": allele_diffs = gwas.calcMaxAlleleFreqDiff( ped_file=options.ped_file, map_file=options.map_file, group_file=options.group_file, test=options.test_label, ref=options.ref_label) allele_diffs.to_csv(options.stdout, sep="\t") elif options.task == "detect_duplicates": # find variants with duplicated position and shared reference # allele indicative of triallelic variants - also same ID # ouput to a filter list infile = argv[-1] dups, tris, oves = gwas.findDuplicateVariants(bim_file=infile, take_last=options.take) if os.path.isabs(options.out_pattern): with open(options.out_pattern + ".triallelic", "w") as otfile: for tvar in tris: otfile.write("%s\n" % tvar) with open(options.out_pattern + ".duplicates", "w") as odfile: for dvar in dups: odfile.write("%s\n" % dvar) with open(options.out_pattern + ".overlapping", "w") as ovfile: for ovar in oves: ovfile.write("%s\n" % ovar) else: outpattern = os.path.abspath(options.out_pattern) with open(outpattern + ".triallelic", "w") as otfile: for tvar in tris: otfile.write("%s\n" % tvar) with open(outpattern + ".duplicates", "w") as odfile: for dvar in dups: odfile.write("%s\n" % dvar) with open(outpattern + ".overlapping", "w") as ovfile: for ovar in oves: ovfile.write("%s\n" % ovar) # write footer and output benchmark information. E.stop()
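# gwas.countByVariantAllele() is project code. The sketch below counts
# minor allele frequencies directly from a plink text file pair, assuming
# the standard layout: six leading metadata columns in the .ped file
# followed by two allele calls per variant, variant IDs in column 2 of the
# .map file, and "0" marking missing calls.
import collections

import pandas as pd


def count_mafs(ped_file, map_file):
    """return a DataFrame of minor allele frequencies indexed by SNP ID."""
    with open(map_file) as inf:
        snp_ids = [line.split()[1] for line in inf]
    counts = [collections.Counter() for _ in snp_ids]
    with open(ped_file) as inf:
        for line in inf:
            fields = line.split()[6:]
            for idx in range(len(snp_ids)):
                for allele in fields[2 * idx:2 * idx + 2]:
                    if allele != "0":
                        counts[idx][allele] += 1
    mafs = []
    for counter in counts:
        total = sum(counter.values())
        # frequency of the least common allele; 0 if monomorphic/missing
        maf = min(counter.values()) / float(total) if len(counter) > 1 else 0.0
        mafs.append(maf)
    return pd.DataFrame({"MAF": mafs}, index=snp_ids)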
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-f", "--target-format", dest="change_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer'), help="set quality scores to format " "[default=%default].") parser.add_option( "--guess-format", dest="guess_format", type="choice", choices=('sanger', 'solexa', 'phred64', 'integer'), help="quality score format to assume if ambiguous [default=%default].") parser.add_option("--pattern-identifier", dest="pattern", type="string", help="filename prefix [default=%default].") parser.set_defaults(change_format=None, guess_format=None, pattern="%s.gz") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) c = E.Counter() outfile_seq = IOTools.open_file(options.pattern % "csfasta", "w") outfile_qual = IOTools.open_file(options.pattern % "qual", "w") if options.change_format: iter = Fastq.iterate_convert(options.stdin, format=options.change_format, guess=options.guess_format) else: iter = Fastq.iterate(options.stdin) for record in iter: c.input += 1 outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq)) outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals)) c.output += 1 outfile_seq.close() outfile_qual.close() # write footer and output benchmark information. E.info("%s" % str(c)) E.stop()
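# Fastq.iterate_convert() is project code. Converting between quality
# encodings is an ASCII offset shift; e.g. phred64 -> sanger subtracts the
# 64 offset and re-encodes at offset 33. A minimal sketch of that single
# step:
def convert_quals_phred64_to_sanger(quals):
    """re-encode a phred64 quality string at the sanger offset (33)."""
    return "".join(chr(ord(c) - 64 + 33) for c in quals)


# e.g. convert_quals_phred64_to_sanger("hhhh") == "IIII" (all Q40)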
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("--extend", dest="extension", type="int", help="extend tags by this number of bases " "[default=%default].") parser.add_option("--shift-size", dest="shift", type="int", help="shift tags by this number of bases " "[default=%default].") parser.add_option("--window-size", dest="window_size", type="int", help="window size to be used in the analysis" "[default=%default].") parser.add_option("--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis " "[default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "enrichment", "dmr", "rms", "rpm", "all", "convert"), help="actions to perform [default=%default].") parser.add_option("-w", "--bigwig-file", dest="bigwig", action="store_true", help="store wig files as bigwig files - requires a " "genome file [default=%default]") parser.add_option("--treatment", dest="treatment_files", type="string", action="append", help="BAM files for treatment. At least one is required " "[%default]") parser.add_option("--control", dest="control_files", type="string", action="append", help="BAM files for control for differential " "methylation analysis. Optional [%default].") parser.add_option("--input", dest="input_files", type="string", action="append", help="BAM files for input correction. " "Optional [%default].") parser.add_option("--is-not-medip", dest="is_medip", action="store_false", help="data is not MeDIP data and is not expected " "to fit the calibration model. No CpG " "density normalized rms data is computed" "[default=%default].") parser.add_option("--output-rdata", dest="output_rdata", action="store_true", help="in dmr analysis, write R session to file. " "The file name " "is given by --ouptut-filename-pattern [%default].") parser.add_option("--rdata-file", dest="input_rdata", type="string", help="in dmr analysis, read saved R session from " "file. 
This can be used to apply different " "filters [%default]") parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float", help="FDR threshold to apply for selecting DMR " "[default=%default].") parser.add_option("--fdr-method", dest="fdr_method", type="choice", choices=("bonferroni", "BH", "holm", "hochberg", "hommel", "BY", "fdr", "none"), help="FDR method to apply for selecting DMR " "[default=%default].") parser.add_option("--bwa", dest="bwa", action="store_true", help="alignment generated with bwa" "[default=%default].") parser.add_option("--unique", dest="unique", type="float", help="Threshold p-value to determine which read pile\ ups are the result of PCR overamplification" "[default=%default].") parser.add_option("--chroms", dest="chroms", type="str", help="Comma delimited list of chromosomes to include" "[default=%default].") parser.set_defaults(input_format="bam", ucsc_genome="Hsapiens.UCSC.hg19", genome_file=None, extend=0, shift=0, window_size=300, saturation_iterations=10, toolset=[], bigwig=False, treatment_files=[], control_files=[], input_files=[], output_rdata=False, input_rdata=None, is_medip=True, fdr_threshold=0.1, fdr_method="BH", bwa=False, unique=0.001, chroms=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if "convert" in options.toolset: results = [] for line in CSV.DictReader(options.stdin, dialect="excel-tab"): if line['edgeR.p.value'] == "NA": continue # assumes only a single treatment/control treatment_name = options.treatment_files[0] control_name = options.control_files[0] status = "OK" try: results.append( Expression.GeneExpressionResult._make(( "%s:%i-%i" % (line['chr'], int(line['start']), int(line['stop'])), treatment_name, float(line['MSets1.rpkm.mean']), 0, control_name, float(line['MSets2.rpkm.mean']), 0, float(line['edgeR.p.value']), float(line['edgeR.adj.p.value']), float(line['edgeR.logFC']), math.pow(2.0, float(line['edgeR.logFC'])), float(line['edgeR.logFC']), # no transform ["0", "1"][float(line['edgeR.adj.p.value']) < options.fdr_threshold], status))) except ValueError as msg: raise ValueError("parsing error %s in line: %s" % (msg, line)) Expression.writeExpressionResults(options.stdout, results) return if len(options.treatment_files) < 1: raise ValueError("please specify a filename with sample data") if options.bigwig and not options.genome_file: raise ValueError("please provide a genome file when outputting bigwig") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() if len(options.toolset) == 0: options.toolset = ["all"] do_all = "all" in options.toolset if options.chroms is None: chrstring = "" else: chroms = options.chroms.split(",") chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms) # load MEDIPS R.library('MEDIPS') genome_file = 'BSgenome.%s' % options.ucsc_genome R.library(genome_file) window_size = options.window_size extend = options.extend shift = options.shift saturation_iterations = options.saturation_iterations uniq = float(options.unique) if options.bwa is True: BWA = "TRUE" else: BWA = "FALSE" if "saturation" in options.toolset or do_all: E.info("saturation analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = 
%(BWA)s, %(chrstring)s nrit = 1)''' % locals()) R.png(E.get_output_file("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.get_output_file("%s_saturation_estimation.tsv" % fn)) outfile = IOTools.open_file( E.get_output_file("%s_saturation.tsv" % fn), "w") outfile.write("category\tvalues\n") outfile.write("estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write("true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write("nreads\t%s\n" % ",".join(["%i" % x for x in R('''sr$numberReads''')])) outfile.close() if "coverage" in options.toolset or do_all: E.info("CpG coverage analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''') R('''dev.off()''') R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''') R('''dev.off()''') # note: this file is large R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' % E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn)) if 'enrichment' in options.toolset or do_all: E.info("CpG enrichment analysis") outfile = IOTools.open_file(E.get_output_file("enrichment.tsv.gz"), "w") slotnames = (("regions.CG", "regions_CG", "%i"), ("regions.C", "regions_C", "%s"), ("regions.G", "regions_G", "%f"), ("regions.relH", "regions_relH", "%i"), ("regions.GoGe", "regions_GoGe", "%i"), ("genome.CG", "genome_CG", "%s"), ("genome.C", "genome_C", "%s"), ("genome.G", "genome_G", "%i"), ("genome.relH", "genome_relH", "%i"), ("enrichment.score.relH", "enrichment_relH", "%s"), ("enrichment.score.GoGe", "enrichment_GoGe", "%s")) outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) outfile.write("%s" % fn) for slotname, label, pattern in slotnames: value = tuple(R('''ce$%s''' % slotname)) if len(value) == 0: value = "" outfile.write("\t%s" % pattern % value[0]) outfile.write("\n") outfile.close() if options.input_rdata: E.info("reading R session info from '%s'" % options.input_rdata) R('''load('%s')''' % options.input_rdata) else: if "dmr" in options.toolset or "correlation" in options.toolset \ or do_all: # build four sets for x, fn in enumerate(options.treatment_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''treatment_set = c(%s)''' % ",".join([ "treatment_R%i" % x for x in range(len(options.treatment_files)) ])) if options.control_files: for x, fn in enumerate(options.control_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, 
extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''control_set = c(%s)''' % ",".join([ "control_R%i" % x for x in range(len(options.control_files)) ])) # build coupling vector R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''') if "correlation" in options.toolset or do_all: R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''') R('''write.table(cor.matrix, file='%s', sep="\t")''' % E.get_output_file("correlation")) if "dmr" in options.toolset or do_all: # Data that does not fit the model causes # "Error in 1:max_signal_index : argument of length 0" # The advice is to set MeDIP=FALSE # See: http://comments.gmane.org/ # gmane.science.biology.informatics.conductor/52319 if options.is_medip: medip = "TRUE" else: medip = "FALSE" fdr_method = options.fdr_method E.info("applying test for differential methylation") R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals()) # Note: several Gb in size # Output full methylation data table R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' % E.get_output_file("data.tsv.gz")) # save R session if options.output_rdata: R('''save.image(file='%s', safe=FALSE)''' % E.get_output_file("session.RData")) # DMR analysis - test for windows and output if "dmr" in options.toolset: E.info("selecting differentially methylated windows") # test windows for differential methylation fdr_threshold = options.fdr_threshold R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals()) R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' % E.get_output_file("significant_windows.gz")) # select gain and merge adjacent windows try: R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''') E.info('gain output: %s, merged: %s' % (str(R('''dim(gain)''')), str(R('''dim(gain_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute gain windows: msg=%s" % msg) # select loss and merge adjacent windows try: R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''') E.info('loss output: %s, merged: %s' % (str(R('''dim(loss)''')), str(R('''dim(loss_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' % E.get_output_file("loss.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute loss windows: msg=%s" % msg) # if "rpm" in options.toolset or do_all: # outputfile = E.get_output_file("rpm.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = T, descr = "rpm")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # compress(outputfile) # if "rms" in options.toolset or do_all: # outputfile = E.get_output_file("rms.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = F, descr = "rms")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # 
compress(outputfile) # write footer and output benchmark information. E.stop()
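# isPaired() is a helper that is not shown here. Given that its result is
# interpolated directly into R code above (paired=%(paired)s), it
# presumably returns the strings "TRUE"/"FALSE"; a sketch with pysam,
# peeking at the first few records:
import itertools

import pysam


def isPaired(filename, nreads=100):
    """return "TRUE" if the first `nreads` alignments are paired-end."""
    with pysam.AlignmentFile(filename) as samfile:
        reads = itertools.islice(samfile.fetch(until_eof=True), nreads)
        paired = all(read.is_paired for read in reads)
    return "TRUE" if paired else "FALSE"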
def buildOptionParser(argv): if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-f", "--format", dest="format", type="choice", choices=("bam", "bigwig"), help="format of genomic input files for densities " "[%default]") parser.add_option( "-o", "--use-interval", dest="use_interval", action="store_true", help="only count tags that are in interval given " "in bed file. Otherwise, use a fixed width window (see --window-size) " "around peak [%default]") parser.add_option( "-w", "--window-size", dest="window_size", type="int", help="window size in bp on either side of a peak used for getting " "read densities. If ``--window-size`` is 1000, the actual window size" "will be 2kb, 1kb on either side of the peak in an interval" "[%default]") parser.add_option( "-b", "--bin-size", dest="bin_size", type="int", help="bin-size in bp for computing read densities. " "If ``--window-size`` is set to 1000 and ``--bin-size`` to 10, " "there will be 100 bins on either side of a peak. " "[%default]") parser.add_option( "--smooth-method", dest="smooth_method", type="choice", choices=("none", "sum", "sg"), help="smooting method to apply to density data before sampling " "according to ``bin-size``. sg=SavitzkyGolay, sum=sum density in bin, " "none=no smoothing " "[%default]") parser.add_option("-s", "--sort-order", dest="sort_orders", type="choice", action="append", choices=("peak-height", "peak-width", "unsorted", "interval-width", "interval-score"), help="output sort order for matrices. " "[%default]") parser.add_option( "-c", "--control-bam-file", "--control-bigwig-file", action="append", dest="control_files", type="string", help="control file. If given, two peakshapes are computed, " "one for the primary data and one for the control data. " "The control file is centered around the same " "base as the primary file and output in the same " "sort order as the primary profile to all side-by-side. " "comparisons. Multiple control files can be given. The " "control files should have the same format as the " "principal input file " "[%default]") parser.add_option( "-r", "--random-shift", dest="random_shift", action="store_true", help="shift intervals in random direction up/downstream of interval " "[%default]") parser.add_option("-e", "--centring-method", dest="centring_method", type="choice", choices=("reads", "middle"), help="centring method. Available are: " "reads=use density to determine peak, " "middle=use middle of interval " "[%default]") parser.add_option("-n", "--normalize-matrix", dest="normalization", type="choice", choices=("none", "sum"), help="matrix normalisation to perform. " "[%default]") parser.add_option( "--use-strand", dest="strand_specific", action="store_true", help="use strand information in intervals. Intervals on the " "negative strand are flipped " "[%default]") parser.add_option( "-i", "--shift-size", dest="shift", type="int", help="shift for reads. When processing bam files, " "reads will be shifted upstream/downstream by this amount. " "[%default]") parser.set_defaults( bin_size=10, shift=0, window_size=1000, sort_orders=[], centring_method="reads", control_files=[], random_shift=False, strand_specific=False, format="bam", report_step=100, use_interval=False, smooth_method=None, ) return parser
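# The "sg" choice for --smooth-method refers to Savitzky-Golay smoothing.
# How the main script applies it is not shown here; a sketch of smoothing
# one density vector with scipy (the window length and polynomial order
# are illustrative values, not taken from the script):
import numpy as np
from scipy.signal import savgol_filter


def smooth_density(density, window_length=21, polyorder=3):
    """return a Savitzky-Golay smoothed copy of a 1D density array."""
    density = np.asarray(density, dtype=float)
    # savgol_filter needs an odd window no longer than the data
    window_length = min(window_length, len(density))
    if window_length % 2 == 0:
        window_length -= 1
    return savgol_filter(density, window_length, polyorder)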
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--score-method", dest="method", type="choice", choices=[ "PICS", "LDscore", "ABF", "R2_rank", "get_eigen", "calc_prior", "credible_set", "summarise" ], help="SNP scoring/prioritisation method to apply.") parser.add_option("--database", dest="database", type="string", help="SQL database containing LD information " "in table format. Expects columns SNP_A, " "SNP_B, R2, BP_A and BP_B (Plink --r2 output)") parser.add_option("--ld-directory", dest="ld_dir", type="string", help="directory containing tabix-index BGZIP " "LD files. Assumes Plink used to calculate LD") parser.add_option("--table-name", dest="table", type="string", help="name of the SQL table containing the LD" "values") parser.add_option("--chromosome", dest="chromosome", type="string", help="chromosome to subset the association results " "file on") parser.add_option("--ld-threshold", dest="ld_threshold", type="float", help="the threshold of LD above which variants will " "be taken forward.") parser.add_option("--rank-threshold", dest="rank_threshold", type="float", help="the threshold in terms of the top n% SNPs to " "output based on the ranking metric. e.g. " "--rank-threshold=0.01 is the top 1% SNPs") parser.add_option("--credible-interval", dest="interval", type="float", help="The credible set interval size to generate the " "credible set of SNPs") parser.add_option("--prior-variance", dest="prior_var", type="float", help="the prior variance used to weight the SNP " "variance") parser.add_option("--fine-map-window", dest="map_window", type="int", help="the region size to included around the index " "SNP as the fine-mapping region.") parser.add_option("--eigen-score-directory", dest="eigen_dir", type="string", help="PATH to directory containing tabix indexed " "eigen score files") parser.add_option("--flat-prior", dest="flat_prior", action="store_true", help="Ignore functional annotation information and " "use an uninformative prior on each SNP") parser.add_option("--snp-set", dest="snp_set", type="string", help="Pre-defined SNP set as a list of SNP IDs." "If used to calculate priors contains column of scores.") parser.add_option( "--distribution", dest="dist", type="choice", choices=["normal", "t", "gamma", "lognormal", "exponential"], help="distribution from which to draw prior " "probabilities") parser.add_option("--distribution-parameters", dest="dist_params", type="string", help="distribution parameters as a comma-separated list") parser.add_option("--lead-snp-id", dest="lead_snp", type="int", help="0-based item number in filename") parser.add_option("--filename-separator", dest="separator", type="string", help="filename separator to extract information") parser.add_option("--snp-column", dest="snp_col", type="int", help="0-based index of SNP ID column number") parser.add_option("--probability-column", dest="prob_col", type="int", help="0-based index of posterior probabilities column" " number") parser.set_defaults( ld_dir=None, dist="normal", dist_params=None, snp_set=None, prior_var=0.04, interval=0.99, eigen_dir=None, map_window=100000, ld_threshold=0.5, database=None, table=None, flat_prior=False, lead_snp=2, separator="_", snp_col=0, prob_col=1, ) # add common options (-h/--help, ...) 
and parse command line (options, args) = E.start(parser, argv=argv) infile = argv[-1] if len(infile.split(",")) > 1: pass else: peek = pd.read_table(infile, nrows=5, sep="\s*", header=0) try: if len(peek["TEST"] != "ADD"): clean = False else: clean = True except KeyError: clean = True if options.method == "LDscore": snpscores = gwas.snpPriorityScore(gwas_results=infile, database=options.database, table_name=options.table, chromosome=options.chromosome, ld_dir=options.ld_dir, clean=clean) # take top 1%, all SNPs doesn't achieve anything useful ranks = int(len(snpscores.index) * 0.01) snpscores = snpscores.iloc[:ranks] elif options.method == "PICS": snp_list = {} if options.snp_set and not options.flat_prior: with IOTools.open_file(options.snp_set, "r") as sfile: for line in sfile.readlines(): snp = line.split("\t")[0] try: score = float(line.split("\t")[-1].rstrip("\n")) except ValueError: score = 0 snp_list[snp] = float(score) # get the parameter estimates for the distribution # if they have not been provided if not options.dist_params: dist_params = gwas.estimateDistributionParameters( data=snp_list.values(), distribution=options.dist) else: dist_params = tuple( [float(fx) for fx in options.dist_params.split(",")]) E.info("Calculating priors on SNPs") priors = gwas.calcPriorsOnSnps(snp_list=snp_list, distribution=options.dist, params=dist_params) elif options.snp_set and options.flat_prior: with IOTools.open_file(options.snp_set, "r") as sfile: for line in sfile.readlines(): snp = line.split("\t")[0] snp_list[snp] = 1.0 priors = snp_list else: # allow for no priors or scores to be set, # use of priors will be ignored, # i.e. when prior and likelihood are not from # conjugate distributions priors = None # PICS scores expects the gwas results file to # only contain the region of interest, which # represents an independent association signal # if a SNP has not been genotyped, # but it is in strong LD, it will cause problems # downstream <- only allow SNPs that # are present in the analysis snpscores = gwas.PICSscore(gwas_results=infile, database=options.database, table_name=options.table, chromosome=options.chromosome, priors=priors, clean=clean, ld_dir=options.ld_dir, ld_threshold=options.ld_threshold) snpscores.columns = ["SNP", "PICS"] posterior_sum = 0 snpscores.sort_values(ascending=False, inplace=True) post_snps = [] for snp in snpscores.index: if posterior_sum < 99.0: posterior_sum += snpscores.loc[snp] post_snps.append(snp) else: break snpscores = snpscores.loc[post_snps] snpscores.drop_duplicates(inplace=True) elif options.method == "R2_rank": # rank SNPs based on their LD with the lead # SNP, take the top n% SNPs snpscores = gwas.LdRank(gwas_results=infile, database=options.database, table_name=options.table, ld_dir=options.ld_dir, chromosome=options.chromosome, ld_threshold=options.ld_threshold, top_snps=options.rank_threshold, clean=clean) elif options.method == "ABF": snpscores = gwas.ABFScore(gwas_results=infile, region_size=options.map_window, chromosome=options.chromosome, prior_variance=options.prior_var, clean=clean) elif options.method == "get_eigen": E.info("Fetching Eigen scores") snpscores = gwas.getEigenScores(eigen_dir=options.eigen_dir, bim_file=infile, snp_file=options.snp_set) snpscores = pd.DataFrame(snpscores).T elif options.method == "credible_set": E.info("Creating credible set") snpscores = gwas.makeCredibleSet(probs_file=infile, credible_set=options.interval, lead_snp_indx=options.lead_snp, filename_sep=options.separator, snp_column=options.snp_col, 
probs_column=options.prob_col) elif options.method == "summarise": E.info("Collating SNP prioritisation results") file_list = infile.split(",") snpscores = gwas.summariseResults(file_list=file_list) snpscores.to_csv(options.stdout, index_label="SNP", sep="\t") # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--R-scripts", dest="scripts_r", type="string", help="PATH to R scripts and functions") parser.add_option("--trait1", dest="trait1", type="string", help="name/column header of trait 1 in the " "input data table") parser.add_option("--trait2", dest="trait2", type="string", help="name/column header of trait 2 in the " "input data table") parser.add_option("--snp-list", dest="snp_list", type="string", help="optional list of snps on which to " "restrict analysis.") parser.add_option("--covariates", dest="covars", type="string", help="column headers that refer to covariates " "to adjust primary traits for") parser.add_option("--resamples", dest="resample", type="int", help="number of resamples with replacement " "to use for bootstrapping") parser.add_option("--trait1-model", dest="trait1_mod", type="choice", choices=["logistic", "linear"], help="model to use to fit covariates and trait") parser.add_option("--trait2-model", dest="trait2_mod", type="choice", choices=["logistic", "linear"], help="model to use to fit covariates and trait") parser.set_defaults(resample=999, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) infile = argv[-1] # snp headers are assumed to start with 'rs' # read the dataframe with pandas then push # it into R df = pd.read_table(infile, sep="\t", header=0, index_col=None) E.info("Parsing SNP IDs") if options.snp_list: snp_set = set() with open(options.snp_list, "r") as sfile: snp_list = [s.rstrip("\n") for s in sfile.readlines()] snp_list = set(snp_list) for snp in snp_list: snp_re = re.compile(snp) snp_set.update([sx for sx in df.columns if re.search(snp_re, sx)]) snps = [st for st in snp_set] else: snp_re = re.compile("^rs") snps = [sx for sx in df.columns if re.search(snp_re, sx)] E.info("{} SNPs found in data table".format(len(snps))) out_df = pythonWrapper4Pet(dataframe=df, snps=snps, covars=options.covars, scriptsdir=options.scripts_r, trait1=options.trait1, trait2=options.trait2, model1=options.trait1_mod, model2=options.trait2_mod, resamples=options.resample) out_df.to_csv(options.stdout, sep="\t", index_label="SNP") # write footer and output benchmark information. E.stop()
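# pythonWrapper4Pet() delegates to the R scripts given by --R-scripts, so
# the bootstrap itself is not visible here. A sketch of the resampling
# scheme that --resamples refers to -- drawing individuals with
# replacement and recomputing a statistic, here a trait-trait correlation:
import numpy as np


def bootstrap_correlation(trait1, trait2, resamples=999, seed=0):
    """return bootstrap replicates of corr(trait1, trait2)."""
    trait1 = np.asarray(trait1, dtype=float)
    trait2 = np.asarray(trait2, dtype=float)
    rng = np.random.RandomState(seed)
    n = len(trait1)
    stats = np.empty(resamples)
    for i in range(resamples):
        idx = rng.randint(0, n, size=n)  # sample individuals with replacement
        stats[i] = np.corrcoef(trait1[idx], trait2[idx])[0, 1]
    return stats


# e.g. np.percentile(bootstrap_correlation(t1, t2), [2.5, 97.5]) gives a
# 95% bootstrap confidence interval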
Type:: python merge_tables.py --help for command line help. Command line options -------------------- ''' import sys import string import CGATCore.Experiment as E parser = E.OptionParser(version="%prog version: $Id$") def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser.add_option("-t", "--table", dest="tables", type="string",
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option(
        "-m", "--map", dest="filename_map", type="string",
        help="filename with mapping of species ids to swissprot species ids.")

    parser.set_defaults(
        separator="|",
        filename_map=None,
    )

    (options, args) = E.start(parser)

    # initialize to an empty mapping so the lookup below is safe
    # when no map file is given
    map_species2sp = {}
    if options.filename_map:
        map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))

    ninput, noutput, nerrors = 0, 0, 0
    for line in sys.stdin:
        if line[0] == ">":
            ninput += 1
            id = re.match(">([^/ \t]+)", line[:-1]).groups()[0]
            data = id.split(options.separator)
            species = data[0]
            gene, transcript = None, None
            if len(data) == 2:
                gene = data[1]
            elif len(data) >= 3:
                gene = data[2]
                transcript = data[1]
            if map_species2sp:
                try:
                    species = map_species2sp[species]
                except KeyError:
                    nerrors += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# could not map species %s\n" % species)
            if transcript:
                options.stdout.write(
                    ">%s_%s GENEID=%s\n" % (transcript, species, gene))
            else:
                options.stdout.write(">%s_%s\n" % (species, gene))
            noutput += 1
        else:
            options.stdout.write(line)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nerrors=%i\n" %
            (ninput, noutput, nerrors))

    E.stop()
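# IOTools.ReadMap() is a cgat utility; a minimal equivalent for the way it
# is used above, assuming a two-column tab-separated file mapping species
# ids to swissprot species ids (comment lines starting with "#" skipped):
def read_map(infile):
    """return a dict built from the first two columns of each line."""
    result = {}
    for line in infile:
        if line.startswith("#"):
            continue
        fields = line.rstrip("\n").split("\t")
        if len(fields) >= 2:
            result[fields[0]] = fields[1]
    return result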
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: calculate_histogram_2D.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-t", "--titles", dest="titles", action="store_true",
        help="input data has title in first row [default=%default].")
    parser.add_option(
        "--no-titles", dest="titles", action="store_false",
        help="input data has no title in first row [default=%default].")
    parser.add_option("-1", "--column1", dest="column1", type="int",
                      help="first column to use [default=%default].")
    parser.add_option("-2", "--column2", dest="column2", type="int",
                      help="second column to use [default=%default].")
    parser.add_option("--bin-size1", dest="bin_size1", type="float",
                      help="bin size for first column [default=%default].")
    parser.add_option("--bin-size2", dest="bin_size2", type="float",
                      help="bin size for second column [default=%default].")

    parser.set_defaults(column1=1, column2=2, bin_size1=1.0,
                        bin_size2=1.0, titles=True)

    (options, args) = E.start(parser)

    options.column1 -= 1
    options.column2 -= 1

    vals = []

    # retrieve histogram
    lines = [x for x in sys.stdin.readlines() if x[0] != "#"]

    if options.titles:
        data = lines[0][:-1].split("\t")
        print("\t".join(
            (data[options.column1], data[options.column2], "counts")))
        del lines[0]

    ninput, noutput, nskipped = 0, 0, 0

    for l in lines:
        ninput += 1
        data = l[:-1].split("\t")
        try:
            # string.atof was removed in Python 3; use the float builtin
            val = list(
                map(float, (data[options.column1], data[options.column2])))
        except (IndexError, ValueError):
            nskipped += 1
            continue
        vals.append(val)
        noutput += 1

    lines = None

    h = Histogram2D.Calculate(
        vals,
        bin_function=lambda x: (int(x[0] / options.bin_size1),
                                int(x[1] / options.bin_size2)))

    Histogram2D.Print(
        h,
        bin_function=lambda x: (x[0] * options.bin_size1,
                                x[1] * options.bin_size2,
                                x[2]))

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.stop()
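# Histogram2D.Calculate()/Print() are project code. The bin_function
# arguments above imply a simple scheme: map each (x, y) pair to integer
# bin coordinates, count pairs per bin, and scale the coordinates back for
# output. A minimal equivalent:
import collections


def calculate_histogram2d(vals, bin_size1=1.0, bin_size2=1.0):
    """count (x, y) pairs per 2D bin; keys are integer bin coordinates."""
    return collections.Counter(
        (int(x / bin_size1), int(y / bin_size2)) for x, y in vals)


def print_histogram2d(counts, bin_size1=1.0, bin_size2=1.0):
    """print one tab-separated row per occupied bin."""
    for (bx, by), n in sorted(counts.items()):
        print("%f\t%f\t%i" % (bx * bin_size1, by * bin_size2, n))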
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "--region", dest="region", type="string", help="region to restrict analysis to [%default]") parser.add_option( "--window-size", dest="window_size", type="int", help="window size to use [%default]") parser.add_option( "--output-all-windows", dest="output_all_windows", action="store_true", help="output all windows. By default, windows without reads are skipped " "[%default]") parser.add_option( "--reference-fasta", "--input-filename-fasta", dest="input_filename_fasta", type="string", help="filename with reference sequence. If given, used to " "compute G+C content in windows [%default]") parser.set_defaults( force_output=False, region=None, output_all_windows=False, window_size=500, input_filename_fasta=None, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) is_stdin = True if len(args) > 0: pysam_in = pysam.AlignmentFile(args[0], "rb") if args[0] != "-": is_stdin = False elif options.stdin == sys.stdin: pysam_in = pysam.AlignmentFile("-", "rb") else: pysam_in = pysam.AlignmentFile(options.stdin, "rb") if options.stdin != "-": is_stdin = False if options.input_filename_fasta: fasta = pysam.FastaFile(options.input_filename_fasta) else: fasta = None counts_df = bam2stats_window_count( pysam_in, region=options.region, window_size=options.window_size, fasta=fasta) if not options.output_all_windows: counts_df = counts_df[counts_df.alignments > 0] # add G+C content if fasta: counts_df["percent_gc"] = 100.0 * counts_df.bases_gc / (counts_df.bases_gc + counts_df.bases_at) counts_df.fillna(0, inplace=True) counts_df.to_csv( options.stdout, sep="\t") E.stop()
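# bam2stats_window_count() is imported from elsewhere; the sketch below is
# consistent with the columns used above (alignments, bases_gc, bases_at),
# counting alignments per fixed-width window and, if a fasta is supplied,
# tallying G+C and A+T reference bases in each window:
import pandas as pd
import pysam


def window_counts(pysam_in, contig, window_size=500, fasta=None):
    """return per-window alignment counts (and base counts) for one contig."""
    length = pysam_in.get_reference_length(contig)
    rows = []
    for start in range(0, length, window_size):
        end = min(start + window_size, length)
        row = {"contig": contig, "start": start, "end": end,
               "alignments": pysam_in.count(contig, start, end)}
        if fasta is not None:
            seq = fasta.fetch(contig, start, end).upper()
            row["bases_gc"] = seq.count("G") + seq.count("C")
            row["bases_at"] = seq.count("A") + seq.count("T")
        rows.append(row)
    return pd.DataFrame(rows)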
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--summarise", dest="summarise", type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map", dest="output_map", action="store_true",
                      help="output map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        # only output the mapping file - do not summarise,
        # regardless of the specified options
        found = []
        options.stdout.write("Domain\tkingdom\tphylum\tclass\t"
                             "order\tfamily\tgenus\tspecies\n")
        for lca in LCA.iterate(options.stdin):
            # if bacteria or archaea, the kingdom will be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom
            hierarchy = [lca.domain, kingdom, lca.phylum, lca._class,
                         lca.order, lca.family, lca.genus, lca.species]
            if hierarchy in found:
                continue
            found.append(hierarchy)
            options.stdout.write("\t".join(hierarchy) + "\n")
        return

    # the taxonomic levels reported, in output order; the "+" entries
    # are the extended (<level>_plus) assignments.
    # subspecies and subspecies+ mapping removed for the time being.
    levels = ["domain", "kingdom", "kingdom+", "phylum", "phylum+",
              "class", "class+", "order", "order+", "family", "family+",
              "genus", "genus+", "species", "species+"]

    def get_taxon(lca, level):
        # map output level names onto LCA attribute names,
        # e.g. "class+" -> "_class_plus"
        return getattr(
            lca, level.replace("+", "_plus").replace("class", "_class"))

    def count_unmapped(counter, level):
        # increment the per-level unmapped count,
        # e.g. "phylum+" -> phylum_plus_unmapped
        key = "%s_unmapped" % level.replace("+", "_plus")
        setattr(counter, key, getattr(counter, key, 0) + 1)

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        nreads = collections.defaultdict(int)
        total = 0
        c = E.Counter()

        for lca in LCA.iterate(options.stdin):
            total += 1
            for level in levels:
                taxon = get_taxon(lca, level)
                if taxon != "NA":
                    nreads[level] += 1
                    level_counts[level].add(taxon)
                else:
                    count_unmapped(c, level)

        # n<level>: number of distinct taxa seen at that level;
        # nseq<level>: number of reads assigned at that level
        options.stdout.write("\t".join(
            ["n%s" % level for level in levels] +
            ["nseq%s" % level for level in levels]) + "\n")

        options.stdout.write("\t".join(map(str, (
            [len(level_counts[level]) for level in levels] +
            [nreads[level] for level in levels]))) + "\n")

    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        taxa_counts = dict(
            (level, collections.defaultdict(int)) for level in levels)
        total = 0
        c = E.Counter()

        for lca in LCA.iterate(options.stdin):
            total += 1
            for level in levels:
                taxon = get_taxon(lca, level)
                if taxon != "NA":
                    taxa_counts[level][taxon] += 1
                else:
                    count_unmapped(c, level)
                    unmapped[level] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa, str(count),
                    "{:.8}".format(float(count) / total_level),
                    "{:.8}".format(
                        float(count) / (float(total_level) / 1000000))
                ]) + "\n")
        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective taxon assignments
        options.stdout.write("\t".join(["id"] + levels) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join(
                [lca.identifier] +
                [get_taxon(lca, level) for level in levels]) + "\n")

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-fastq-file", dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option(
        "--output-removed-tsv", dest="output_removed_tsv", type="string",
        help="if given, sequence identifiers of removed sequences will "
        "be stored in this file [%default]")

    parser.add_option(
        "--output-stats-tsv", dest="output_stats_tsv", type="string",
        help="if given, output statistics will be written to this file. "
        "[%default]")

    parser.add_option(
        "--output-removed-fastq", dest="output_removed_fastq", type="string",
        help="if given, removed fastq records will "
        "be stored in this file [%default]")

    parser.add_option("-m", "--method", dest="methods", action="append",
                      type="choice",
                      choices=("filter-N", "filter-identifier", "filter-ONT",
                               "offset-quality"),
                      help="methods to apply [%default]")

    parser.add_option("--set-prefix", dest="set_prefix", type="string",
                      help="set sequence prefix [%default]")

    parser.add_option("--input-filter-tsv", dest="input_filter_tsv",
                      type="string",
                      help="list of sequence ids to filter [%default]")

    parser.add_option("--min-average-quality", dest="min_average_quality",
                      type="float",
                      help="minimum average quality [%default]")

    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length [%default]")

    parser.add_option("--quality-offset", dest="quality_offset", type="int",
                      help="offset to modify quality values with [%default]")

    parser.set_defaults(
        methods=[],
        max_percent_N=10.0,
        input_fastq_file=None,
        set_prefix=None,
        output_removed_tsv=None,
        output_removed_fastq=None,
        output_stats_tsv=None,
        input_filter_tsv=None,
        min_average_quality=0,
        min_length=0,
        quality_offset=0,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    filter_n = "filter-N" in options.methods
    filter_ont = "filter-ONT" in options.methods

    if "filter-identifier" in options.methods:
        if options.input_filter_tsv is None:
            raise ValueError(
                "please set --input-filter-tsv for method filter-identifier")
        with IOTools.open_file(options.input_filter_tsv) as inf:
            filter_identifier = set(
                [x.split()[0].strip() for x in inf.readlines()])
    else:
        filter_identifier = False

    if options.output_removed_tsv:
        outf_removed_tsv = IOTools.open_file(options.output_removed_tsv, "w")
    else:
        outf_removed_tsv = None

    if options.output_removed_fastq:
        outf_removed_fastq = IOTools.open_file(
            options.output_removed_fastq, "w")
    else:
        outf_removed_fastq = None

    if options.set_prefix:
        prefix = "{}".format(options.set_prefix)
    else:
        prefix = None

    quality_offset = options.quality_offset

    with pysam.FastxFile(options.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            remove = False
            if filter_n:
                chars = collections.Counter(read.sequence)
                if "N" in chars and \
                   100.0 * chars["N"] / len(read.sequence) > \
                   options.max_percent_N:
                    remove = True
                    counter.filter_n += 1
            if filter_identifier:
                if read.name not in filter_identifier:
                    counter.filter_identifier += 1
                    remove = True
            if filter_ont:
                quals = read.get_quality_array()
                n = len(quals)
                if n < options.min_length or \
                   float(sum(quals)) / n < options.min_average_quality:
                    counter.remove_ont += 1
                    remove = True
            if remove:
                counter.removed += 1
                if outf_removed_tsv:
                    outf_removed_tsv.write(read.name + "\n")
                if outf_removed_fastq:
                    outf_removed_fastq.write(str(read) + "\n")
                continue
            if prefix:
                read.name = prefix + read.name[2:]
            if quality_offset:
                quals = numpy.array(read.get_quality_array())
                quals += quality_offset
                quals[quals < 0] = 0
                quals += 33
                # pysam fastq records are read-only, so build the record
                # by hand. Note: the description is not output.
                read = "@{}\n{}\n+\n{}".format(
                    read.name, read.sequence,
                    "".join([chr(x) for x in quals]))
            counter.output += 1
            options.stdout.write(str(read) + "\n")

    if outf_removed_tsv:
        outf_removed_tsv.close()

    if outf_removed_fastq:
        outf_removed_fastq.close()

    if options.output_stats_tsv:
        with IOTools.open_file(options.output_stats_tsv, "w") as outf:
            outf.write(counter.asTable(as_rows=False) + "\n")

    E.info(counter)
    E.stop()
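
# Illustrative sketch, not part of the original script: the quality-offset
# branch above shifts each Phred score, clamps at zero, and re-encodes the
# result as a Phred+33 ASCII quality string. A standalone version:


def example_shift_qualities(quals, offset):
    """shift a list of Phred scores and return a Phred+33 quality string."""
    shifted = [max(0, q + offset) for q in quals]
    return "".join([chr(q + 33) for q in shifted])

# example_shift_qualities([30, 2], -5) == ":!"  (shifted scores 25 and 0)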
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 "
        "2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--filename", dest="filename", type="string",
                      help="input bam file")

    parser.add_option("-a", "--aligner", dest="aligner", type="string",
                      help="aligner used to generate the bam file",
                      default="bwa")

    parser.add_option("-r", "--output-report", type="string", dest="report",
                      help="filename for the tabular report", default="")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="filename for the output bam file", default="")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # check the aligner is supported
    if options.aligner != "bwa":
        raise ValueError(
            "currently only bwa is supported as aligner-specific flags "
            "are used")

    # check that either a report or outfile name has been specified
    if options.report == "" and options.outfile == "":
        raise ValueError("nothing to do")

    # analyse the bamfile
    samfile = pysam.AlignmentFile(options.filename, "rb")
    uniq_map, best_map, uORb_map = {}, {}, {}
    properly_paired = 0

    for read in samfile.fetch():
        if read.is_proper_pair:
            tagd = dict(read.tags)
            u, b, key = False, False, read.qname
            if tagd["XT"] == "U":
                u = True
                uniq_map[key] = 1
            if "X0" in tagd:
                if tagd["X0"] == 1:
                    b = True
                    best_map[key] = 1
            if u or b:
                uORb_map[key] = 1
            properly_paired += 1
    samfile.close()

    npp = properly_paired // 2
    E.info("number of proper pairs: %s" % npp)

    # write a tabular report if a report name is given
    if options.report != "":
        E.info("writing report on number of proper pairs with "
               "unique/best reads")

        def _row(x, npp=npp):
            name, d = x
            n = len(list(d.keys()))
            pc = float(n) / npp * 100
            line = "%s\t%i\t%.2f" % (name, n, pc)
            return line

        header = "\t".join(
            ["pair_criteria", "n_proper_pairs", "percent_proper_pairs"])

        with IOTools.open_file(options.report, "w") as report:
            report.write(header + "\n")
            for x in [("unique", uniq_map), ("best", best_map),
                      ("unique_or_best", uORb_map)]:
                report.write(_row(x) + "\n")

    # create a new bam containing uniquely mapping read pairs
    # if an outfile is specified
    if options.outfile != "":
        E.info("writing proper pairs with unique or best read to %s" %
               options.outfile)

        samfile = pysam.AlignmentFile(options.filename, "rb")
        outbam = pysam.AlignmentFile(options.outfile, "wb", template=samfile)

        for read in samfile.fetch():
            if read.is_proper_pair:
                if read.qname in uORb_map:
                    outbam.write(read)

        samfile.close()
        outbam.close()

    E.stop()
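
# Illustrative sketch, not part of the original script: the filtering above
# keys on two bwa-specific tags - XT ("U" marks a unique alignment) and X0
# (the number of best hits). A read pair is kept if either criterion holds;
# the sketch uses dict.get rather than indexing, an assumption that missing
# tags should simply fail the test:


def example_is_unique_or_best(tags):
    """return True if a bwa tag dict marks the read as unique or best."""
    unique = tags.get("XT") == "U"
    best = tags.get("X0") == 1
    return unique or best

# example_is_unique_or_best({"XT": "U"}) is True
# example_is_unique_or_best({"XT": "R", "X0": 3}) is False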
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $1.0$",
                            usage=globals()["__doc__"])

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="Supply database name")

    parser.add_option("-u", "--indivfile", dest="indivfile", type="string",
                      help="Supply input bed file name for individual "
                      "utrons")

    parser.add_option("-p", "--partfile", dest="partfile", type="string",
                      help="Supply input bed file name for partnered utrons")

    parser.add_option("-n", "--novelfile", dest="novelfile", type="string",
                      help="Supply input bed file name for novel utrons")

    parser.add_option("-t", "--targetfile", dest="targetfile", type="string",
                      help="Supply input bed file name for miRNA TSs")

    parser.add_option("-o", "--outfile", dest="outfile", type="string",
                      help="Supply output csv file name")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    global db
    db = options.database

    # get expression files
    expressions = PUtils.fetch_DataFrame(
        "SELECT track, match_gene_id, transfrag_id, fpkm "
        "FROM agg_agg_agg_cuffcompare_transcripts "
        "CROSS JOIN agg_agg_agg_class "
        "WHERE transfrag_id = agg_agg_agg_class.transcript_id "
        "AND fpkm > 0",
        options.database)
    expressions = expressions.set_index(
        ["track", "match_gene_id", "transfrag_id"])
    grouped_expression = expressions["fpkm"].groupby(
        level=["track", "match_gene_id"])

    ex_fracts = grouped_expression.apply(lambda x: x / x.sum())
    ex_fracts.to_csv("pruned_expressionfractions.csv")
    ex_sums = grouped_expression.apply(lambda x: x.sum())
    ex_sums.to_csv("pruned_expressionsums.csv")

    ex_sums = pd.read_csv("pruned_expressionsums.csv",
                          names=['track', 'match_gene_id', 'exp_sum'])
    ex_sums = ex_sums.set_index(['match_gene_id', 'track'])
    ex_fracts = pd.read_csv(
        "pruned_expressionfractions.csv",
        names=['track', 'match_gene_id', 'transfrag_id', 'exp_fract'])
    ex_fracts = ex_fracts.set_index(
        ['track', 'match_gene_id', 'transfrag_id'])

    fpkm_ex_fracts = ex_fracts.join(expressions, how='inner')
    fpkm_ex_fracts = fpkm_ex_fracts.reset_index()
    fpkm_ex_fracts = fpkm_ex_fracts.set_index(['match_gene_id', 'track'])
    ex_all = fpkm_ex_fracts.join(ex_sums, how='inner')
    ex_all = ex_all.reset_index()
    ex_all.to_csv("pruned_expression_all.csv")
    ex_all = pd.read_csv("pruned_expression_all.csv")
    ex_all = ex_all.set_index('transfrag_id')

    # stop distances
    ind_utrons = pd.read_table(
        options.indivfile, header=0, sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')
    ind_utrons['dist'] = ind_utrons.apply(
        lambda row: getStopDistdf(row), axis=1)
    ind_utrons = ind_utrons.set_index('name')

    grouped_stopdist = ind_utrons.groupby(level='name')
    transcript_dist = grouped_stopdist.apply(
        lambda group: group['dist'].max())
    transcript_dist.name = 'dist'
    transcript_over_under_50 = transcript_dist.apply(
        lambda row: getOverUnder50(row))
    transcript_over_under_50.name = 'over_under_50'

    ex_all_dist = ex_all.join(transcript_over_under_50, how='left')
    ex_all_dist = ex_all_dist.join(transcript_dist, how='left')
    ex_all_dist['utron'] = ex_all_dist.apply(
        lambda row: isUtron(row), axis=1)

    # novel utrons
    novel_utrons = pd.read_table(
        options.novelfile, header=0, sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand",
               "a", "b", "c", "d", "e", "f"],
        usecols=["start", "end", "name"],
        compression='gzip')
    novel_utrons = novel_utrons.set_index(novel_utrons["name"])
    # exclude entries with different start/end utron coordinates
    # in the same transcript
    novel_utrons = novel_utrons.drop_duplicates(subset="name")
    novel_utrons['novel_utron'] = novel_utrons.apply(
        lambda row: insertYesCol(row), axis=1)
    novel_utrons = novel_utrons.drop(['start', 'end', 'name'], axis=1)
    ex_all_dist_nov = ex_all_dist.join(novel_utrons, how='left')

    # TSs
    utron_TSs = pd.read_table(
        options.targetfile, header=0, sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand", "stop"],
        usecols=["start", "end", "name", "strand", "stop"],
        compression='gzip')
    utron_TSs['miRNA_TS'] = utron_TSs.apply(
        lambda row: insertYesCol(row), axis=1)
    utron_TSs = utron_TSs.drop(
        ["start", "end", "strand", "stop"], axis=1).drop_duplicates()
    utron_TSs = utron_TSs.set_index(["name"])
    ex_all_dist_nov_TS = ex_all_dist_nov.join(utron_TSs, how='left')

    # extra utrons
    tcons_ens = pd.read_table(
        options.partfile, header=0, sep='\t',
        names=["chrom", "start", "end", "name", "score", "strand",
               "a", 'b', 'c', 'd', 'e', 'f'],
        usecols=["start", "end", "name", "strand"],
        compression='gzip')
    tcons_ens['TCONS_id'] = tcons_ens.apply(
        lambda row: get_tcons(row), axis=1)
    tcons_ens['partner_id'] = tcons_ens.apply(
        lambda row: get_enst(row), axis=1)
    tcons_ens = tcons_ens.set_index('TCONS_id')
    tcons_ens['partner_id_TCONS'] = tcons_ens.apply(
        lambda row: get_tcons_from_ens(row), axis=1)
    tcons_ens = tcons_ens.drop_duplicates()
    tcons_ens['extra_utron'] = tcons_ens.apply(
        lambda row: insertYesCol(row), axis=1)

    partners = tcons_ens[['name', 'partner_id_TCONS']]
    partners = partners[partners['partner_id_TCONS'] != 'No_id']
    partners = partners.set_index('partner_id_TCONS')

    utrons_and_partners = tcons_ens.append(partners)
    utrons_and_partners = utrons_and_partners.join(
        ex_all_dist_nov, how='inner')
    utrons_and_partners = utrons_and_partners.reset_index().drop_duplicates(
        subset=['match_gene_id', 'track', 'index'])
    utrons_and_partners = utrons_and_partners.set_index(
        ['match_gene_id', 'track'])

    groups = utrons_and_partners.groupby(level=['match_gene_id', 'track'])
    sums = groups.apply(lambda group: sum(group['fpkm']))
    utrons_and_partners['partner_exp_sum'] = sums
    utrons_and_partners['partner_exp_fract'] = utrons_and_partners.apply(
        lambda row: row['fpkm'] / row['partner_exp_sum'], axis=1)

    only_utrons = utrons_and_partners[
        utrons_and_partners['extra_utron'] == 'Yes']
    only_utrons = only_utrons[[
        'index', 'extra_utron', 'partner_exp_sum', 'partner_exp_fract',
        'partner_id_TCONS', 'partner_id']]
    only_utrons = only_utrons.reset_index()
    only_utrons = only_utrons.dropna(
        subset=['match_gene_id', 'track', 'index'])
    only_utrons = only_utrons.set_index(['match_gene_id', 'track', 'index'])

    ex_all_dist_nov_TS = ex_all_dist_nov_TS.reset_index()
    ex_all_dist_nov_TS = ex_all_dist_nov_TS.set_index(
        ['match_gene_id', 'track', 'index'])
    ex_all_dist_nov_TS_ext = ex_all_dist_nov_TS.join(only_utrons, how='left')

    # patients and treatment
    final = ex_all_dist_nov_TS_ext.reset_index()
    final['treatment'] = final.apply(
        lambda row: label_treatment(row), axis=1)
    final['patient'] = final.apply(lambda row: label_patient(row), axis=1)
    final.to_csv(options.outfile)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--R-script", dest="scripts_r", type="string",
                      help="PATH to location of R scripts and functions")

    parser.add_option("--trait1-results", dest="trait1_res", type="string",
                      help="summary statistics for trait 1")

    parser.add_option("--trait2-results", dest="trait2_res", type="string",
                      help="summary statistics for trait 2")

    parser.add_option("--maf-table", dest="maf_table", type="string",
                      help="Table containing allele frequency info for "
                      "all SNPs")

    parser.add_option("--maf-snp-column", dest="maf_snpcol", type="string",
                      help="column header containing SNP IDs")

    parser.add_option("--trait1-snplist", dest="trait1_snplist",
                      type="string",
                      help="restrict the analysis to this set of SNPs "
                      "for trait1")

    parser.add_option("--trait2-snplist", dest="trait2_snplist",
                      type="string",
                      help="restrict the analysis to this set of SNPs "
                      "for trait2")

    parser.add_option("--gene-list", dest="gene_list", type="string",
                      help="list of genes to test eQTL-trait overlap with. "
                      "Either trait1 or trait2 must contain a GENE column.")

    parser.add_option("--trait1-type", dest="trait1_type", type="choice",
                      choices=["quant", "cc"],
                      help="Trait 1 type, either quantitative (quant) "
                      "or binary (cc)")

    parser.add_option("--trait2-type", dest="trait2_type", type="choice",
                      choices=["quant", "cc"],
                      help="Trait 2 type, either quantitative (quant) "
                      "or binary (cc)")

    parser.add_option("--trait1-size", dest="trait1_size", type="int",
                      help="sample size for trait1 analysis, only use this "
                      "if the NMISS column is missing")

    parser.add_option("--trait2-size", dest="trait2_size", type="int",
                      help="sample size for trait2 analysis, only use this "
                      "if the NMISS column is missing")

    parser.add_option("--trait1-p-column", dest="trait1_pcol", type="string",
                      help="Column header for P-value column in trait 1 "
                      "results file, if not `P`")

    parser.add_option("--trait2-p-column", dest="trait2_pcol", type="string",
                      help="Column header for P-value column in trait 2 "
                      "results file, if not `P`")

    parser.add_option("--trait1-prevalence", dest="trait1_prev",
                      type="float",
                      help="Prevalence of trait 1 in the population. Only "
                      "relevant for binary traits")

    parser.add_option("--trait2-prevalence", dest="trait2_prev",
                      type="float",
                      help="Prevalence of trait 2 in the population. Only "
                      "relevant for binary traits")

    parser.add_option("--chromosome", dest="chrome", type="string",
                      help="Restrict analysis to this chromosome.")

    parser.add_option("--restrict-from", dest="restrict_from", type="int",
                      help="start co-ordinate to restrict analysis. Must "
                      "provide `--chromosome` when restricting region")

    parser.add_option("--restrict-to", dest="restrict_to", type="int",
                      help="end co-ordinate to restrict analysis. Must "
                      "provide `--chromosome` when restricting region")

    parser.set_defaults(chrome=None,
                        restrict_from=None,
                        restrict_to=None,
                        trait1_prev=None,
                        trait2_prev=None,
                        trait1_pcol="P",
                        trait2_pcol="P",
                        maf_snpcol="SNP")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # check all files contain the necessary fields. Results files may be
    # whitespace- or tab-delimited; peek at the first few rows to work
    # out which.
    def peek_results(filename, pcol, size, label):
        """return (separator, compression, use_input_size) for a results
        file, checking that the SNP, NMISS and P-value columns are
        present."""
        if filename.endswith(".gz"):
            compression = "gzip"
        else:
            compression = None
        use_input_size = False
        try:
            peek = pd.read_table(filename, nrows=5, sep=r"\s*", header=0,
                                 index_col=None, compression=compression,
                                 engine='python')
            sep = r"\s*"
        except StopIteration:
            peek = pd.read_table(filename, nrows=5, sep="\t", header=0,
                                 compression=compression, index_col=None)
            sep = "\t"
        len_cols = len(set(peek.columns).intersection(
            ["SNP", "NMISS", "{}".format(pcol)]))
        if len_cols != 3:
            if size:
                use_input_size = True
                E.warn("NMISS column is not present, "
                       "using input sample size n={}".format(size))
            else:
                raise IOError("{} input file does not contain "
                              "SNP, NMISS or P columns".format(label))
        return sep, compression, use_input_size

    E.info("Parsing trait 1 file: {}".format(options.trait1_res))
    trait1_sep, trait1_comp, t1_nsize = peek_results(
        options.trait1_res, options.trait1_pcol, options.trait1_size,
        "Trait-1")

    E.info("Parsing trait 2 file: {}".format(options.trait2_res))
    trait2_sep, trait2_comp, t2_nsize = peek_results(
        options.trait2_res, options.trait2_pcol, options.trait2_size,
        "Trait-2")

    E.info("Parsing MAF table file: {}".format(options.maf_table))
    if options.maf_table.endswith(".gz"):
        maf_comp = "gzip"
    else:
        maf_comp = None

    try:
        maf_peek = pd.read_table(options.maf_table, nrows=5, sep=r"\s*",
                                 header=0, index_col=None,
                                 compression=maf_comp, engine='python')
        maf_sep = r"\s*"
    except StopIteration:
        maf_peek = pd.read_table(options.maf_table, nrows=5, sep="\t",
                                 header=0, compression=maf_comp,
                                 index_col=None)
        maf_sep = "\t"

    len_cols = len(set(maf_peek.columns).intersection(
        ["{}".format(options.maf_snpcol), "MAF"]))
    if len_cols != 2:
        raise IOError("Frequency table does not contain "
                      "SNP or MAF columns")

    trait1_results = pd.read_table(options.trait1_res, sep=trait1_sep,
                                   header=0, compression=trait1_comp,
                                   index_col=None)

    trait2_results = pd.read_table(options.trait2_res, sep=trait2_sep,
                                   header=0, compression=trait2_comp,
                                   index_col=None)

    if options.trait1_pcol != "P":
        trait1_results.loc[:, "P"] = trait1_results.loc[
            :, options.trait1_pcol]

    if options.trait2_pcol != "P":
        trait2_results.loc[:, "P"] = trait2_results.loc[
            :, options.trait2_pcol]

    if t1_nsize:
        trait1_results.loc[:, "NMISS"] = options.trait1_size

    if t2_nsize:
        trait2_results.loc[:, "NMISS"] = options.trait2_size

    maf_table = pd.read_table(options.maf_table, sep=maf_sep, header=0,
                              compression=maf_comp, index_col=None)

    if options.maf_snpcol != "SNP":
        maf_table.loc[:, "SNP"] = maf_table.loc[:, options.maf_snpcol]

    if options.gene_list:
        gene_list = set()
        with open(options.gene_list, "r") as gfile:
            for gene in gfile.readlines():
                gene_list.add(gene.rstrip("\n"))
    else:
        gene_list = None

    # restrict analysis to a specific set of SNPs - useful for picking
    # just the SNPs that are part of independent association signals
    if options.trait1_snplist:
        t1_snplist = set()
        with open(options.trait1_snplist, "r") as t1_sfile:
            for t1snp in t1_sfile.readlines():
                t1_snplist.add(t1snp.rstrip("\n"))
        trait1_results = trait1_results.loc[
            trait1_results["SNP"].isin(t1_snplist)]

    if options.trait2_snplist:
        t2_snplist = set()
        with open(options.trait2_snplist, "r") as t2_sfile:
            for t2snp in t2_sfile.readlines():
                t2_snplist.add(t2snp.rstrip("\n"))
        trait2_results = trait2_results.loc[
            trait2_results["SNP"].isin(t2_snplist)]

    out_df = testColoc(trait1=trait1_results,
                       trait2=trait2_results,
                       trait1_type=options.trait1_type,
                       trait2_type=options.trait2_type,
                       scriptsdir=options.scripts_r,
                       gene_list=gene_list,
                       maf_table=maf_table,
                       trait1_prev=options.trait1_prev,
                       trait2_prev=options.trait2_prev,
                       chromosome=options.chrome,
                       start=options.restrict_from,
                       end=options.restrict_to)

    out_df.to_csv(options.stdout, index_label="Trait", sep="\t")

    # write footer and output benchmark information.
    E.stop()
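
# Illustrative sketch, not part of the original script: both
# --trait*-snplist files above are read with the same pattern - one SNP
# identifier per line, collected into a set that is then used to subset
# the results table:


def example_read_snplist(filename):
    """return the set of SNP identifiers listed one-per-line in filename."""
    snps = set()
    with open(filename, "r") as infile:
        for line in infile:
            snps.add(line.rstrip("\n"))
    return snps

# trait_results = trait_results.loc[trait_results["SNP"].isin(snps)]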
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-p", "--path", dest="path", type="string",
                      help="path to scan for files [%default]")

    parser.add_option("-d", "--destination", dest="dest", type="string",
                      help="path to deposit files into [%default]")

    parser.set_defaults(path='/ifs/projects/sftp',
                        url='http://www.cgat.org/downloads/',
                        dest='/ifs/projects/overview')

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    statement = "find %s -name 'index.html'" % options.path

    process = subprocess.Popen(statement,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)  # text, not bytes
    stdout, stderr = process.communicate()

    files = stdout.split('\n')
    files.sort()

    outfile = IOTools.openFile(os.path.join(options.dest, "index.html"), "w")

    outfile.write('''
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>CGAT project reports</title>
<link rel="stylesheet" href="cgat.css" type="text/css" />
<link rel="stylesheet" href="pygments.css" type="text/css" />
<link rel="shortcut icon" href="http://cgatwiki.anat.ox.ac.uk/favicon.ico">
<script type="text/javascript" src="sorttable.js"></script>
</head>
<body>
<div class="related">
  <h3>Navigation</h3>
  <ul>
    <li><a href="index.html">CGAT Projects Overview</a> »</li>
  </ul>
</div>
<div class="document">
  <div class="documentwrapper">
    <div class="bodywrapper">
      <div class="body">
        <div class="section" id="cgat-pipelines">
<H1>CGAT exported project pages</H1>
<p>
This page is for internal use only. Do not distribute outside of CGAT and
do not make this page available on the world wide web.
</p>
<table class="sortable">\n''')

    outfile.write(
        '''<tr><th>Project</th><th>Report</th><th>Title</th></tr>\n''')

    for f in files:
        if f == '':
            continue
        proj = re.search(r'(proj\d+)', f).groups()[0]
        relpath = re.sub(r'.*proj\d+/', '', f)
        report = re.sub(r'^[^/]*/', '', os.path.dirname(relpath))
        lines = IOTools.openFile(f).readlines()
        titles = [x for x in lines if "<title>" in x]
        if titles:
            title = re.search("<title>(.*)</title>", titles[0]).groups()[0]
        else:
            title = "NA"
        if title.endswith("documentation"):
            title = title[:-len("documentation")]
        url = os.path.join(options.url, relpath)
        outfile.write(
            '<tr><td>%(proj)s</td><td><a HREF="%(url)s">%(report)s</a>'
            '</td><td>%(title)s</td></tr>\n' % locals())

    outfile.write('''
</table>
        </div>
      </div>
    </div>
  </div>
</div>
<div class="sphinxsidebar">
  <div class="sphinxsidebarwrapper">
    <p class="logo"><a href="contents.html">
      <img class="logo" src="cgat_logo.png" alt="Logo"/>
    </a></p>
  </div>
</div>
</body>
</html>\n''')
    outfile.close()

    E.info('created output file %s' % outfile.name)

    # write footer and output benchmark information.
    E.Stop()
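
# Illustrative sketch, not part of the original script: each table row above
# is derived from an index.html path with two regular expressions - one
# pulling out the projNNN component, one reducing the path to a report name.
# The sketch assumes, as the loop above does, that every path contains a
# projNNN component:


def example_parse_report_path(path):
    """return (project, report) parsed from a .../projNNN/... path."""
    import os
    import re
    project = re.search(r"(proj\d+)", path).groups()[0]
    relpath = re.sub(r".*proj\d+/", "", path)
    report = re.sub(r"^[^/]*/", "", os.path.dirname(relpath))
    return project, report

# example_parse_report_path("/ifs/projects/proj013/report/html/index.html")
# == ("proj013", "html")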
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--gtf-file", dest="filename_gtf", type="string",
        help="filename with gene models in gtf format [%default]")

    parser.add_option(
        "-m", "--filename-mismapped", dest="filename_mismapped",
        type="string",
        help="output bam file for mismapped reads [%default]")

    parser.add_option(
        "-j", "--junctions-bed-file", dest="filename_junctions",
        type="string",
        help="bam file with reads mapped across junctions [%default]")

    parser.add_option(
        "-r", "--filename-regions", dest="filename_regions", type="string",
        help="filename with regions to remove in bed format [%default]")

    parser.add_option(
        "-t", "--transcripts-gtf-file", dest="filename_transcriptome",
        type="string",
        help="bam file with reads mapped against transcripts [%default]")

    parser.add_option(
        "-p", "--map-tsv-file", dest="filename_map", type="string",
        help="filename mapping transcript numbers (used by "
        "--transcripts-gtf-file) to transcript names "
        "(used by --gtf-file) [%default]")

    parser.add_option(
        "-s", "--filename-stats", dest="filename_stats", type="string",
        help="filename to output stats to [%default]")

    parser.add_option(
        "-o", "--colour", dest="colour_mismatches", action="store_true",
        help="mismatches will use colour differences (CM tag) [%default]")

    parser.add_option(
        "-i", "--ignore-mismatches", dest="ignore_mismatches",
        action="store_true",
        help="ignore mismatches [%default]")

    parser.add_option(
        "-c", "--remove-contigs", dest="remove_contigs", type="string",
        help="','-separated list of contigs to remove [%default]")

    parser.add_option(
        "-f", "--force-output", dest="force", action="store_true",
        help="force overwriting of existing files [%default]")

    parser.add_option("-u", "--unique", dest="unique", action="store_true",
                      help="remove reads not matching uniquely [%default]")

    parser.add_option("--output-sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.set_defaults(
        filename_gtf=None,
        filename_mismapped=None,
        filename_junctions=None,
        filename_transcriptome=None,
        filename_map=None,
        remove_contigs=None,
        force=False,
        unique=False,
        colour_mismatches=False,
        ignore_mismatches=False,
        output_sam=False,
        filename_table=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 1:
        raise ValueError("please supply one bam file")

    bamfile_genome = args[0]
    genome_samfile = pysam.AlignmentFile(bamfile_genome, "rb")

    if options.remove_contigs:
        options.remove_contigs = options.remove_contigs.split(",")

    if options.filename_map:
        E.info("reading map")
        id_map = IOTools.read_map(
            IOTools.open_file(options.filename_map), has_header=True)
        id_map = dict([(y, x) for x, y in id_map.items()])
    else:
        id_map = None

    transcripts = {}
    if options.filename_gtf:
        E.info("indexing geneset")
        mapped, missed = 0, 0
        for gtf in GTF.transcript_iterator(
                GTF.iterator(IOTools.open_file(options.filename_gtf))):
            gtf.sort(key=lambda x: x.start)
            transcript_id = gtf[0].transcript_id
            if id_map:
                try:
                    transcript_id = id_map[transcript_id]
                    mapped += 1
                except KeyError:
                    missed += 1
                    continue
            transcripts[transcript_id] = gtf

        E.info("read %i transcripts from geneset (%i mapped, %i missed)" %
               (len(transcripts), mapped, missed))

    regions_to_remove = None
    if options.filename_regions:
        E.info("indexing regions")
        regions_to_remove = IndexedGenome.Simple()
        for bed in Bed.iterator(IOTools.open_file(options.filename_regions)):
            regions_to_remove.add(bed.contig, bed.start, bed.end)
        E.info("read %i regions" % len(regions_to_remove))

    if options.filename_transcriptome:
        transcripts_samfile = pysam.AlignmentFile(
            options.filename_transcriptome, "rb")
    else:
        transcripts_samfile = None

    if options.output_sam:
        output_samfile = pysam.AlignmentFile("-", "wh",
                                             template=genome_samfile)
    else:
        output_samfile = pysam.AlignmentFile("-", "wb",
                                             template=genome_samfile)

    if options.filename_mismapped:
        if not options.force and os.path.exists(options.filename_mismapped):
            raise IOError("output file %s already exists" %
                          options.filename_mismapped)
        output_mismapped = pysam.AlignmentFile(options.filename_mismapped,
                                               "wb",
                                               template=genome_samfile)
    else:
        output_mismapped = None

    if options.filename_junctions:
        junctions_samfile = pysam.AlignmentFile(options.filename_junctions,
                                                "rb")
    else:
        junctions_samfile = None

    c = bams2bam_filter(genome_samfile,
                        output_samfile,
                        output_mismapped,
                        transcripts_samfile,
                        junctions_samfile,
                        transcripts,
                        regions=regions_to_remove,
                        unique=options.unique,
                        remove_contigs=options.remove_contigs,
                        colour_mismatches=options.colour_mismatches,
                        ignore_mismatches=options.ignore_mismatches,
                        ignore_transcripts=transcripts_samfile is None,
                        ignore_junctions=junctions_samfile is None)

    if options.filename_stats:
        outf = IOTools.open_file(options.filename_stats, "w")
        outf.write("category\tcounts\n%s\n" % c.asTable())
        outf.close()

    if options.filename_transcriptome:
        transcripts_samfile.close()

    genome_samfile.close()
    output_samfile.close()
    if output_mismapped:
        output_mismapped.close()

    # write footer and output benchmark information.
    E.stop()
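
# Illustrative sketch, not part of the original script: the --map-tsv-file
# handling above inverts the mapping read from file, so that lookups go from
# transcript names back to transcript numbers. The inversion assumes the
# values are unique, as the script does:


def example_invert_map(mapping):
    """invert a dict, swapping keys and values."""
    return dict([(y, x) for x, y in mapping.items()])

# example_invert_map({"1": "ENST0001"}) == {"ENST0001": "1"}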
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_test.py 2782 "
        "2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1", "--infile", dest="filename_input",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2", "--infile2", dest="filename_input2",
                      type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header-names", dest="header", type="string",
                      help="header of value column [default=%default].")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = IOTools.open_file(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)
    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    # split extra arguments into positional arguments and key=value
    # pairs that are forwarded to the R test function
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = IOTools.open_file(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None

    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)
        # compute power of test
        power = R.power_t_test(n=len(values),
                               delta=abs(stat["mean"]),
                               sd=stat["stddev"],
                               sig_level=0.05)['power']
        diff_at_power95 = R.power_t_test(n=len(values),
                                         power=0.95,
                                         sd=stat["stddev"],
                                         sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in list(stat.items()):
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.stop()
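
# Illustrative sketch, not part of the original script: extra command-line
# arguments above are split into positional arguments and key=value pairs
# that are passed through to the R test functions. A standalone version of
# that split:


def example_split_args(args):
    """split args into (positional, keyword) for forwarding to R."""
    kwargs, positional = {}, []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            positional.append(arg)
    return positional, kwargs

# example_split_args(["alternative=greater", "x"])
# == (["x"], {"alternative": "greater"})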