def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from an indexed genome :term:`fasta` file and
    output them to a text file.

    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, matched
      contig name and contig size (in nucleotides). The output file
      name is defined in `PARAMS: interface_contigs`.
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    contigs = []

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        contigs.append([contig, size])

    df_contig = pd.DataFrame(contigs, columns=['contigs', 'size'])
    df_contig.sort_values('contigs', inplace=True)
    df_contig.to_csv(outfile, sep="\t", header=False, index=False)
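# A minimal standalone sketch of the same idea, assuming only an indexed
# genome at `genome_prefix`; it avoids the pipeline globals `P` and `PARAMS`
# used above. The names `genome_prefix` and `contig_sizes_to_tsv` are
# illustrative, not part of the pipeline.
import pandas as pd
from cgat import IndexedFasta


def contig_sizes_to_tsv(genome_prefix, outfile):
    """Write a two-column contig-name/size table for an indexed genome."""
    fasta = IndexedFasta.IndexedFasta(genome_prefix)
    sizes = fasta.getContigSizes(with_synonyms=False)
    # sort by contig name to keep the output deterministic
    df = pd.DataFrame(sorted(sizes.items()), columns=["contigs", "size"])
    df.to_csv(outfile, sep="\t", header=False, index=False)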
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''
    # use the current iotools API (open_file); openFile is the
    # deprecated camel-case spelling
    outf = iotools.open_file(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()
def get_contig_cluster(infile, outfile):
    """Generate a contig-size table for the cluster genome."""
    fasta = IndexedFasta.IndexedFasta(infile)
    contigs = []

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        contigs.append([contig, size])

    df_contig = pd.DataFrame(contigs, columns=['contigs', 'size'])
    df_contig.sort_values('contigs', inplace=True)
    df_contig.to_csv(outfile, sep="\t", header=False, index=False)
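# Hedged example of consuming the contig-size table written above; the
# file name is an assumption.
contig_sizes = {}
with open("cluster_genome.contigs.tsv") as inf:
    for line in inf:
        contig, size = line.rstrip("\n").split("\t")
        contig_sizes[contig] = int(size)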
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
        -mod %(meme_model)s
        -nmotifs %(meme_nmotifs)s
        -oc %(tmpdir)s
        -maxsize %(meme_max_size)s
        %(meme_options)s
        > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def exportSequencesFromBedFile(infile, outfile, masker=None,
                               mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"], P.get_params()["genome"]))
    outs = iotools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(iotools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+",
                                          bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))
    outs.close()
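# A tiny self-contained check of the "leftright" coordinate arithmetic used
# above: for an interval [start, end) of length l, the left flank is
# [start - l, start) and the right flank is [end, end + l), clipped to the
# contig. The values below are made up for illustration.
def leftright_flanks(start, end, lcontig):
    l = end - start
    left = (max(0, start - l), end - l)         # (start - l, start) unclipped
    right = (start + l, min(lcontig, end + l))  # (end, end + l) unclipped
    return left, right

assert leftright_flanks(100, 150, 10000) == ((50, 100), (150, 200))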
def buildContigBed(infile, outfile):
    '''
    Get the contig sizes and coordinates from an indexed genome
    :term:`fasta` file and output them in :term:`BED` format.

    Parameters
    ----------
    infile : str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      :term:`BED` format file containing contig name, start (0) and
      contig size in nucleotides. The output file name is defined in
      `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = iotools.open_file(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()
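# Equivalent standalone view of the BED conversion above, assuming a
# contig->size dict such as the one returned by getContigSizes(); the
# helper name and example sizes are illustrative.
def contigs_to_bed_lines(contig_sizes):
    """Yield whole-contig BED intervals (0-based, half-open)."""
    for contig, size in sorted(contig_sizes.items()):
        yield "%s\t%i\t%i" % (contig, 0, size)

print("\n".join(contigs_to_bed_lines({"chr1": 248956422, "chrM": 16569})))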
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w", "--windows-bed-file", dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d", "--filename-data", dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        action="append", choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c", "--decorator", dest="decorator", type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score",
                                 "stddev-score", "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e", "--skip-empty", dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    parser.add_argument("-t", "--transform", dest="transform", type=str,
                        choices=("none", "overlap", "complement",
                                 "third_codon"),
                        help="transform to use when mapping overlapping "
                        "regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser)

    # test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        # gff input is parsed with the GTF reader as well
        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties by supplying no data and
        # asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            # fall back to the largest window end coordinate per contig
            map_contig2size[contig] = max([x[1] for x in values])
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = \
        0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts",
                  "n1", "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:
        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True
        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True
        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]],
                fasta, args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info("ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, "
           "nskipped_windows=%i, nskipped_data=%i" %
           (len(windows), noutput_contigs, len(contigs),
            ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
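# Hypothetical command line for the window-annotation script above (the
# script name is an assumption; the options match the parser definitions):
#
#   python annotate_windows.py \
#       --genome-file=hg38 \
#       --windows-bed-file=windows.gff.gz \
#       --filename-data=genes.gtf.gz --is-gtf \
#       --transform=overlap --decorator=gc > windows.stats.tsv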
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$", usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only "
                      "covers the segments [%default]")

    parser.add_option("--min-sequence-length", dest="min_length",
                      type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length",
                      type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at 3', 5' or both or no ends. If 3only "
                      "or 5only are set, only the added sequence is "
                      "returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--use-strand", dest="ignore_strand",
                      action="store_false",
                      help="use strand information and return reverse "
                      "complement on intervals located on the negative "
                      "strand. [default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)
    E.info("%s" % counter)
    E.stop()
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    # IMS: new method: extend intervals by set amount
    parser.add_argument("-m", "--method", dest="methods", type=str,
                        action="append",
                        choices=("merge", "filter-genome", "bins",
                                 "block", "sanitize-genome", "shift",
                                 "extend", "filter-names", "rename-chr"),
                        help="method to apply")

    parser.add_argument("--num-bins", dest="num_bins", type=int,
                        help="number of bins into which to merge (used for "
                        "method `bins`)")

    parser.add_argument("--bin-edges", dest="bin_edges", type=str,
                        help="bin_edges for binning method")

    parser.add_argument("--binning-method", dest="binning_method", type=str,
                        choices=("equal-bases", "equal-intervals",
                                 "equal-range"),
                        help="method used for binning (used for method "
                        "`bins` if no bin_edges is given)")

    parser.add_argument("--merge-distance", dest="merge_distance", type=int,
                        help="distance in bases over which to merge "
                        "intervals that are not directly adjacent")

    parser.add_argument("--merge-min-intervals", dest="merge_min_intervals",
                        type=int,
                        help="only output merged intervals that are built "
                        "from at least x intervals")

    parser.add_argument("--merge-by-name", dest="merge_by_name",
                        action="store_true",
                        help="only merge intervals with the same name")

    parser.add_argument("--merge-and-resolve-blocks", dest="resolve_blocks",
                        action="store_true",
                        help="When merging bed12 entries, should blocks be "
                        "resolved?")

    parser.add_argument("--merge-stranded", dest="stranded",
                        action="store_true",
                        help="Only merge intervals on the same strand")

    parser.add_argument("--remove-inconsistent-names",
                        dest="remove_inconsistent_names",
                        action="store_true",
                        help="when merging, do not output intervals where "
                        "the names of overlapping intervals do not match")

    parser.add_argument("--offset", dest="offset", type=int,
                        help="offset for shifting intervals")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-b", "--bam-file", dest="bam_file", type=str,
                        help="bam-formatted filename with genome.")

    parser.add_argument("--filter-names-file", dest="names", type=str,
                        help="list of names to keep. One per line")

    parser.add_argument("--rename-chr-file", dest="rename_chr_file",
                        type=str,
                        help="mapping table between old and new chromosome "
                        "names. TAB separated 2-column file.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        rename_chr_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    (args) = E.start(parser, add_pipe_options=True)

    contigs = None
    chr_map = None

    # Why provide a full indexed genome, when a tsv of contig sizes would do?
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.bam_file:
        samfile = pysam.AlignmentFile(args.bam_file)
        contigs = dict(list(zip(samfile.references, samfile.lengths)))

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("Empty mapping dictionary")

    processor = Bed.iterator(args.stdin)

    for method in args.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                args.merge_distance,
                by_name=args.merge_by_name,
                min_intervals=args.merge_min_intervals,
                remove_inconsistent=args.remove_inconsistent_names,
                resolve_blocks=args.resolve_blocks,
                stranded=args.stranded)
        elif method == "bins":
            if args.bin_edges:
                bin_edges = list(map(float, args.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if not (len(bin_edges) == args.num_bins + 1):
                    raise ValueError(
                        "Number of bin edges must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(
                processor,
                num_bins=args.num_bins,
                method=args.binning_method,
                bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))
        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are available
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor, contigs,
                                       offset=args.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, args.offset)
        elif method == "filter-names":
            if not args.names:
                raise ValueError("please supply list of names to filter")
            names = [name.strip() for name in open(args.names)]
            processor = filterNames(processor, names)
        elif method == "rename-chr":
            if not chr_map:
                raise ValueError("please supply mapping file")
            processor = renameChromosomes(processor, chr_map)

    noutput = 0
    for bed in processor:
        args.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))
    E.stop()
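# Sketch of the two-column mapping table expected by --rename-chr-file
# (tab-separated, old name then new name); the file name and contents are
# illustrative.
with open("rename_chr.tsv", "w") as outf:
    for old, new in [("chr1", "1"), ("chrM", "MT")]:
        outf.write("%s\t%s\n" % (old, new))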
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 "
        "11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file", dest="input_filename_queries",
                      type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin), feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:
        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0
        intervals = Intervals.combine([(gff.start, gff.end)
                                       for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))
    E.stop()
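# The exon aggregation above relies on Intervals.combine collapsing
# overlapping exon coordinates into non-redundant blocks; a small
# illustration (output shown in the comment is the expected behaviour):
from cgat import Intervals

# [(10, 20), (15, 30), (40, 50)] -> [(10, 30), (40, 50)]
print(Intervals.combine([(10, 20), (15, 30), (40, 50)]))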
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file",
                        type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features", type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument("-c", "--counter", dest="counters", type=str,
                        action="append",
                        choices=("bigwig-counts", "binding-pattern",
                                 "classifier", "classifier-rnaseq",
                                 "classifier-rnaseq-splicing",
                                 "classifier-polii", "composition-na",
                                 "composition-cpg", "coverage", "distance",
                                 "distance-genes", "distance-tss", "length",
                                 'neighbours', "overlap", "overlap-stranded",
                                 "overlap-transcripts", "overrun",
                                 "position", "proximity",
                                 "proximity-exclusive",
                                 "proximity-lengthmatched", "quality",
                                 "read-coverage", "read-extension",
                                 "read-overlap", "read-counts",
                                 "read-fullcounts", "readpair-counts",
                                 "readpair-fullcounts", "splice",
                                 "splice-comparison", "territories"),
                        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability", dest="sample_probability",
                        type=float,
                        help="Specify the probability that any given read "
                        "or read pair in a bam file is counted. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand",
                                 "secondstrand", "fr-unstranded",
                                 "fr-firststrand", "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    (args) = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of "
                             "prefixes must be the same as the number "
                             "of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            # check the quality scores, not the genome: this counter
            # operates on the quality-score "fasta" file
            if quality is None:
                raise ValueError('quality requires a quality score '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args, prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic '
                                 'sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic '
                                 'sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = \
                    GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))
        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")
        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("--control-bam-file", dest="control_bam_files",
                        type=str,
                        help="filename with read mapping information for "
                        "input/control. Multiple files can be submitted "
                        "in a comma-separated list ")

    parser.add_argument("--filename-format", dest="filename_format",
                        type=str, choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("-c", "--counter", dest="counters", type=str,
                        action="append",
                        choices=("length", "overlap", "peaks",
                                 "composition-na", "composition-cpg",
                                 "classifier-chipseq", "motif"),
                        help="select counters to apply.")

    parser.add_argument("--motif-sequence", dest="motif_sequence", type=str,
                        help="specify a sequence to search for")

    parser.add_argument("-o", "--offset", dest="offsets", type=int,
                        action="append",
                        help="tag offsets for tag counting - supply as "
                        "many as there are bam-files")

    parser.add_argument("--control-offset", dest="control_offsets",
                        type=int, action="append",
                        help="control tag offsets for tag counting - "
                        "supply as many as there are bam-files.")

    parser.add_argument("-a", "--output-all-fields", dest="all_fields",
                        action="store_true",
                        help="output all fields in original bed file, by "
                        "default only the first 4 are output.")

    parser.add_argument("--output-bed-headers", dest="bed_headers",
                        type=str,
                        help="supply ',' separated list of headers for "
                        "bed component ")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar='bed',
                        help="filename with extra gff files. The order is "
                        "important")

    parser.add_argument("--has-header", dest="has_header",
                        action="store_true",
                        help="bed file with headers. Headers and first "
                        "columns are preserved ")

    parser.set_defaults(genome_file=None,
                        counters=[],
                        bam_files=None,
                        offsets=[],
                        control_bam_files=None,
                        control_offsets=[],
                        all_fields=False,
                        filename_format=None,
                        bed_headers=None,
                        filename_gff=[],
                        has_header=False,
                        motif_sequence=None)

    (args) = E.start(parser)

    if args.bed_headers is not None:
        bed_headers = [x.strip() for x in args.bed_headers.split(",")]
        if len(bed_headers) < 3:
            raise ValueError("a bed file needs at least three columns")
    else:
        bed_headers = None

    if args.has_header:
        while 1:
            line = args.stdin.readline()
            if not line:
                E.warn("empty bed file with no header")
                E.stop()
                return
            if not line.startswith("#"):
                break
        bed_headers = line[:-1].split("\t")

    if "motif" in args.counters and not args.motif_sequence:
        raise ValueError("if using motif must specify a motif-sequence")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.control_bam_files:
        control_bam_files = []
        for bamfile in args.control_bam_files.split(","):
            control_bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        control_bam_files = None

    counters = []

    for c in args.counters:
        if c == "length":
            counters.append(CounterLength(fasta=fasta, options=args))
        elif c == "overlap":
            counters.append(
                CounterOverlap(filename=args.filename_gff[0],
                               fasta=fasta,
                               options=args))
            del args.filename_gff[0]
        elif c == "peaks":
            counters.append(
                CounterPeaks(bam_files,
                             args.offsets,
                             control_bam_files,
                             args.control_offsets,
                             options=args))
        elif c == "composition-na":
            counters.append(
                CounterCompositionNucleotides(fasta=fasta, options=args))
        elif c == "composition-cpg":
            counters.append(
                CounterCompositionCpG(fasta=fasta, options=args))
        elif c == "classifier-chipseq":
            counters.append(
                ClassifierChIPSeq(filename_gff=args.filename_gff,
                                  fasta=fasta,
                                  options=args,
                                  prefix=None))
            del args.filename_gff[0]
        elif c == "motif":
            counters.append(
                CounterMotif(fasta=fasta, motif=args.motif_sequence))

    extra_fields = None
    for bed in Bed.iterator(args.stdin):
        if extra_fields is None:
            # output explicitly given headers
            if bed_headers:
                if len(bed_headers) > bed.columns:
                    raise ValueError(
                        "insufficient columns (%i, expected %i) in %s" %
                        (bed.columns, len(bed_headers), str(bed)))
            else:
                bed_headers = Bed.Headers[:bed.columns]

            args.stdout.write("\t".join(bed_headers))
            args.stdout.write("\t" + "\t".join(
                [x.getHeader() for x in counters]) + "\n")

            extra_fields = list(range(len(bed_headers) - 3))

        for counter in counters:
            counter.update(bed)

        if args.all_fields:
            args.stdout.write(str(bed))
        else:
            args.stdout.write("\t".join(
                [bed.contig, str(bed.start), str(bed.end)] +
                [bed.fields[x] for x in extra_fields]))
        for counter in counters:
            args.stdout.write("\t%s" % str(counter))

        args.stdout.write("\n")

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f", "--features", dest="features", type="string",
                      action="append",
                      help="features to collect [default=%default]")

    parser.add_option("-w", "--window-size", dest="window_size", type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size. [default=%default]")

    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. [default=%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("genomic", "histogram", ),
                      help="methods to apply. [default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:
            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []
            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source, entry.feature)] += \
                entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        # only needed when a genome is supplied; guarded to avoid a
        # crash when no genome-file is given
        if fasta:
            total_genome_size = sum(
                fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) /
                              fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
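# Example invocations for the script above (gff2coverage.py, per its
# version string); the input file names are assumptions:
#
#   python gff2coverage.py --genome-file=hg38 --method=genomic < in.gff
#   python gff2coverage.py --genome-file=hg38 --method=histogram \
#       --window-size=100000 --features=exon < in.gff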
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the
    database *dbhandle* and saved to *filename* in :term:`fasta`
    format.

    If *shuffled* is set, shuffled copies are created as well, with
    the shuffle number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize*
    characters to avoid creating jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order) and 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.
    '''
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s ''' % locals() + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for "
        "pattern finding" % (track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" %
           (track, str(masker)))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter
                        in data]
            new_data.extend([(contig, end, end + (end - start),
                              str(interval_id) + "_right", peakcenter)
                             for contig, start, end, interval_id, peakcenter
                             in data])
            data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []

    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: "
                "start=%i, end=%i, offset=%i - ignored" %
                (track, interval_id, start, end, offset))
            continue

        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached "
                "- only %i sequences output (%i ignored)" %
                (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences.
        # Otherwise N's would be interspersed with real sequence,
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = iotools.open_file(filename, "w")

    # avoid shadowing the `masker` parameter with the loop variable
    for m in masker:
        if m not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, m)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
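# Self-contained illustration of the cutoff logic above: with
# proportion=0.1 and min_sequences=50, at least 50 of the best intervals
# are kept, and 10% (+1) once the interval list is large enough. The
# helper name is illustrative.
def n_to_keep(n_intervals, proportion=None, num_sequences=None,
              min_sequences=None):
    if proportion:
        cutoff = int(n_intervals * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = n_intervals
    return cutoff

assert n_to_keep(1000, proportion=0.1, min_sequences=50) == 101
assert n_to_keep(100, proportion=0.1, min_sequences=50) == 50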
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome")

    parser.add_argument("-i", "--ignore-missing", dest="ignore_missing",
                        action="store_true",
                        help="Ignore transcripts on contigs that are not "
                        "in the genome-file.")

    parser.add_argument("--min-intron-length", dest="min_intron_length",
                        type=int,
                        help="minimum intron length. If the distance "
                        "between two consecutive exons is smaller, the "
                        "region will be marked 'unknown'")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=["full"],
                        help="method to apply")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    if not args.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(args.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(args.stdin))

    annotateGenome(iterator, fasta, args)

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")

    parser.add_argument("-f", "--features", dest="features", type=str,
                        help="feature to collect.")

    parser.add_argument("-i", "--files", dest="files", action="append",
                        help="use multiple annotations.")

    parser.add_argument("-a", "--annotations", dest="annotations", type=str,
                        help="aggregate name for annotations if only a "
                        "single file is provided from STDIN.")

    parser.add_argument("--map-tsv-file", dest="input_filename_map",
                        type=str,
                        help="filename with a map of gene_ids to "
                        "categories.")

    # segment length is numeric, so parse it as an int
    parser.add_argument("-l", "--max-length", dest="max_length", type=int,
                        help="maximum segment length.")

    parser.add_argument("-m", "--merge-overlapping", dest="merge",
                        action="store_true",
                        help="merge overlapping bed segments.")

    parser.add_argument("-s", "--section", dest="section", type=str,
                        choices=("segments", "annotations", "workspace"),
                        help="annotator section.")

    parser.add_argument("--subset", dest="subsets", type=str,
                        action="append",
                        help="add filenames to delimit subsets within the "
                        "gff files. The syntax is "
                        "filename.gff,label,filename.ids.")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    args.files += unknown
    if len(args.files) == 0:
        args.files.append("-")
    args.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in args.files]))

    if args.subsets:
        subsets = collections.defaultdict(list)
        for s in args.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        args.subsets = subsets

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.section == "segments":
        prefix = "##Segs"
    elif args.section == "annotations":
        prefix = "##Id"
    elif args.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % args.section)

    if args.max_length:
        max_length = args.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if args.section == "annotations":
        contigs = set()
        it = itertools.groupby(Bed.iterator(args.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if args.merge:
                beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if args.remove_random and "random" in contig:
                    continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                args.stdout.write(
                    "%s\t%i\t%s\t(%i,%i)\n" %
                    (prefix, nsegments, contig, start, end))
                nsegments += 1

            args.stdout.write("##Ann\t%s\t%s\n" % (
                track,
                "\t".join(["%i" % x
                           for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)

    E.info("ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, "
           "ndiscarded=%i" %
           (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.stop()
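# The --subset syntax above packs three comma-separated fields per entry;
# hedged example (file names are assumptions):
#
#   --subset=annotations.gff,lincRNA,lincRNA_ids.txt \
#   --subset=annotations.gff,protein_coding,pc_ids.txt
#
# which the parsing loop above turns into:
subsets = {"annotations.gff": [("lincRNA", "lincRNA_ids.txt"),
                               ("protein_coding", "pc_ids.txt")]}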
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e", "--extract", dest="extract", type="string",
                      help="extract region for testing purposes. Format is "
                      "contig:strand:from:to. "
                      "The default coordinates are 0-based "
                      "open/closed coordinates on both strands, but can be "
                      "changed by --input-format. "
                      "For example, 'chr1:+:10:12' will return "
                      "bases 11 and 12 on chr1. Elements from the end of "
                      "the string can be omitted. For example, 'chr1' will "
                      "return all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option("-s", "--synonyms", dest="synonyms", type="string",
                      help="list of synonyms. This is a comma-separated "
                      "list of equivalence relations. For example, "
                      "chrM=chrMT means that chrMT will refer to chrM and "
                      "either can be used to retrieve a sequence "
                      "[default=%default]")

    group = E.OptionGroup(parser, "Benchmarking options")
    group.add_option("-b", "--benchmark", dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations", type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size", type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify", dest="verify", type="string",
                     help="verify against other database "
                     "[default=%default].")
    group.add_option("--verify-iterations", dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format", dest="file_format", type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a", "--clean-sequence", dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further "
                      "occurrences of an identifier are suffixed by an "
                      "'_%i' [default=%default].")

    parser.add_option("--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output", dest="force", action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t", "--translator", dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2",
                           "debug")
    group.add_option("-c", "--compression", dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. Valid choices are %s, but depend on "
                     "availability on the system [default=%%default]." %
                     ", ".join(compression_choices))
    group.add_option("--random-access-points", dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")
    group.add_option("--compress-index", dest="compress_index",
                     action="store_true",
                     help="compress index. The default is to use a "
                     "plain-text, human-readable index "
                     "[default=%default].")
    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig, strand, start, end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, "
            "size=%i)" % (options.benchmark_fragment_size),
            setup="from cgat import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" % (
            options.benchmark_num_iterations,
            options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1, fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2, fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))

    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()

    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: "
                                 "\n# %s\n" % (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in list(synonyms.items()):
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()

        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.stop()
# coding: utf-8 # In[2]: import sys from cgat import Bed from cgat import IndexedFasta from cgatcore import iotools from cgat import Genomics # In[3]: genome = IndexedFasta.IndexedFasta("/shared/sudlab1/General/mirror/genomes/plain/hg38.fasta") # In[7]: bedfile = Bed.iterator(iotools.open_file(sys.argv[1])) splice_site_dict = dict() outfile = iotools.open_file(sys.argv[2], "w") for utron in bedfile: ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2) ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end) if utron.strand == "+": splice_site_dict[utron.name] = (ss5_sequence, ss3_sequence) if ":" in utron.name:
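# The notebook fragment above is truncated. A sketch of the minus-strand
# case it presumably goes on to handle: on "-" features the genomic left
# end is the 3' acceptor, so donor and acceptor are the reverse
# complements of the opposite ends. (A sketch only; the original
# continuation is lost, and this revcomp helper is not from the source.)
_RC = str.maketrans("ACGTacgt", "TGCAtgca")

def revcomp(seq):
    # reverse complement of a DNA string
    return seq.translate(_RC)[::-1]

# for a minus-strand utron:
#   donor (5' splice site)    = revcomp(last two bases of the interval)
#   acceptor (3' splice site) = revcomp(first two bases of the interval)
# e.g. splice_site_dict[utron.name] = (revcomp(ss3_sequence),
#                                      revcomp(ss5_sequence))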
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-m", "--method", dest="methods", type=str,
                        action="append",
                        choices=("threshold", "stddev-above-mean",
                                 "multiple-of-mean"),
                        help="method to apply")
    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")
    parser.add_argument("-t", "--threshold", dest="threshold", type=float,
                        help="threshold to apply")
    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file", type=str,
                        metavar="bigwig",
                        help="filename with bigwig information.")

    parser.set_defaults(methods=[],
                        genome_file=None,
                        threshold=10,
                        max_distance=0)

    (args) = E.start(parser, add_pipe_options=True)

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    # initialise both so that the "please supply contig sizes" checks
    # below work when no genome file has been given (previously this
    # raised a NameError instead)
    genome_fasta = None
    contigs = None
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    for method in args.methods:
        if method == "threshold":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=args.threshold,
                                       max_distance=args.max_distance)
        elif method == "stddev-above-mean":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            summary = getBigwigSummary(args.bigwig_file)
            threshold = summary.mean + args.threshold * summary.std
            E.info("applying threshold %f: mean=%f, std=%f" %
                   (threshold, summary.mean, summary.std))
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=threshold,
                                       max_distance=args.max_distance)
        elif method == "multiple-of-mean":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            summary = getBigwigSummary(args.bigwig_file)
            threshold = summary.mean * args.threshold
            E.info("applying threshold %f: mean=%f, std=%f" %
                   (threshold, summary.mean, summary.std))
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=threshold,
                                       max_distance=args.max_distance)

        outfile = args.stdout
        outfile.write("".join(["%s\t%i\t%i\n" % x for x in processor]))
        outfile.write("\n")

    E.stop()
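# ----------------------------------------------------------------------
# applyThreshold() and getBigwigSummary() are defined elsewhere in this
# module. A self-contained sketch of what such a processor could look
# like with the pyBigWig API (an illustration under assumptions, not the
# original implementation):
import math

import pyBigWig

def iter_regions_above(bw, contig, size, threshold, step=100000):
    """yield (contig, start, end) runs where the bigwig value >= threshold."""
    run_start = None
    for block in range(0, size, step):
        # values() returns a list of floats, NaN where there is no data
        values = bw.values(contig, block, min(size, block + step))
        for i, v in enumerate(values):
            above = (not math.isnan(v)) and v >= threshold
            pos = block + i
            if above and run_start is None:
                run_start = pos
            elif not above and run_start is not None:
                yield contig, run_start, pos
                run_start = None
    if run_start is not None:
        yield contig, run_start, size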
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--is-gtf", dest="is_gtf", action="store_true", help="input is gtf instead of gff.") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-m", "--merge-adjacent", dest="merge", action="store_true", help="merge adjacent intervals with the same attributes." " [default=%default]") parser.add_option("-e", "--feature", dest="feature", type="string", help="filter by a feature, for example 'exon', 'CDS'." " If set to the empty string, all entries are output " "[%default].") parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks", type="string", metavar="gff", help="mask sequences with regions given in gff file " "[%default].") parser.add_option("--remove-masked-regions", dest="remove_masked_regions", action="store_true", help="remove regions instead of masking [%default].") parser.add_option("--min-interval-length", dest="min_length", type="int", help="set minimum length for sequences output " "[%default]") parser.add_option("--max-length", dest="max_length", type="int", help="set maximum length for sequences output " "[%default]") parser.add_option("--extend-at", dest="extend_at", type="choice", choices=("none", "3", "5", "both", "3only", "5only"), help="extend at no end, 3', 5' or both ends. If " "3only or 5only are set, only the added sequence " "is returned [default=%default]") parser.add_option("--header-attributes", dest="header_attr", action="store_true", help="add GFF entry attributes to the FASTA record" " header section") parser.add_option("--extend-by", dest="extend_by", type="int", help="extend by # bases [default=%default]") parser.add_option("--extend-with", dest="extend_with", type="string", help="extend using base [default=%default]") parser.add_option("--masker", dest="masker", type="choice", choices=("dust", "dustmasker", "softmask", "none"), help="apply masker [%default].") parser.add_option("--fold-at", dest="fold_at", type="int", help="fold sequence every n bases[%default].") parser.add_option( "--fasta-name-attribute", dest="naming_attribute", type="string", help="use attribute to name fasta entry. 
Currently only compatable" " with gff format [%default].") parser.set_defaults( is_gtf=False, genome_file=None, merge=False, feature=None, filename_masks=None, remove_masked_regions=False, min_length=0, max_length=0, extend_at=None, extend_by=100, extend_with=None, masker=None, fold_at=None, naming_attribute=False, header_attr=False, ) (options, args) = E.start(parser) if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contigs = fasta.getContigSizes() if options.is_gtf: iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) else: gffs = GTF.iterator(options.stdin) if options.merge: iterator = GTF.joined_iterator(gffs) else: iterator = GTF.chunk_iterator(gffs) masks = None if options.filename_masks: masks = {} with iotools.open_file(options.filename_masks, "r") as infile: e = GTF.readAsIntervals(GTF.iterator(infile)) # convert intervals to intersectors for contig in list(e.keys()): intersector = quicksect.IntervalTree() for start, end in e[contig]: intersector.add(start, end) masks[contig] = intersector ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0 nskipped_length = 0 nskipped_noexons = 0 feature = options.feature # iterator is a list containing groups (lists) of features. # Each group of features have in common the same transcript ID, in case of # GTF files. for ichunk in iterator: ninput += 1 if feature: chunk = [x for x in ichunk if x.feature == feature] else: chunk = ichunk if len(chunk) == 0: nskipped_noexons += 1 E.info("no features in entry from " "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start, ichunk[0].end, str(ichunk[0]))) continue contig, strand = chunk[0].contig, chunk[0].strand if options.is_gtf: name = chunk[0].transcript_id else: if options.naming_attribute: attr_dict = { x.split("=")[0]: x.split("=")[1] for x in chunk[0].attributes.split(";") } name = attr_dict[options.naming_attribute] else: name = str(chunk[0].attributes) lcontig = contigs[contig] positive = Genomics.IsPositiveStrand(strand) intervals = [(x.start, x.end) for x in chunk] intervals.sort() if masks: if contig in masks: masked_regions = [] for start, end in intervals: masked_regions += [(x.start, x.end) for x in masks[contig].find( quicksect.Interval(start, end))] masked_regions = Intervals.combine(masked_regions) if len(masked_regions): nmasked += 1 if options.remove_masked_regions: intervals = Intervals.truncate(intervals, masked_regions) else: raise NotImplementedError("unimplemented") if len(intervals) == 0: nskipped_masked += 1 if options.loglevel >= 1: options.stdlog.write( "# skipped because fully masked: " "%s: regions=%s masks=%s\n" % (name, str([(x.start, x.end) for x in chunk]), masked_regions)) continue out = intervals if options.extend_at and not options.extend_with: if options.extend_at == "5only": intervals = [(max(0, intervals[0][0] - options.extend_by), intervals[0][0])] elif options.extend_at == "3only": intervals = [(intervals[-1][1], min(lcontig, intervals[-1][1] + options.extend_by))] else: if options.extend_at in ("5", "both"): intervals[0] = (max(0, intervals[0][0] - options.extend_by), intervals[0][1]) if options.extend_at in ("3", "both"): intervals[-1] = (intervals[-1][0], min(lcontig, intervals[-1][1] + options.extend_by)) if not positive: intervals = [(lcontig - x[1], lcontig - x[0]) for x in intervals[::-1]] out.reverse() s = [ fasta.getSequence(contig, strand, start, end) for start, end in intervals ] # IMS: allow for masking of sequences s = Masker.maskSequences(s, options.masker) l = sum([len(x) for x in s]) if (l < 
options.min_length or (options.max_length and l > options.max_length)): nskipped_length += 1 if options.loglevel >= 1: options.stdlog.write("# skipped because length out of bounds " "%s: regions=%s len=%i\n" % (name, str(intervals), l)) continue if options.extend_at and options.extend_with: extension = "".join((options.extend_with, ) * options.extend_by) if options.extend_at in ("5", "both"): s[1] = extension + s[1] if options.extend_at in ("3", "both"): s[-1] = s[-1] + extension if options.fold_at: n = options.fold_at s = "".join(s) seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)]) else: seq = "\n".join(s) if options.header_attr: attributes = " ".join( [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()]) options.stdout.write( ">%s %s:%s:%s feature:%s %s\n%s\n" % (name, contig, strand, ";".join( ["%i-%i" % x for x in out]), chunk[0].feature, attributes, seq)) else: options.stdout.write( ">%s %s:%s:%s\n%s\n" % (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq)) noutput += 1 E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, " "nskipped_masked=%i, nskipped_length=%i" % (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length)) E.stop()
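# ----------------------------------------------------------------------
# The --fold-at logic above in isolation: wrap a sequence at a fixed
# width before writing FASTA. A small self-contained illustration:
def fold(seq, width=60):
    # split seq into width-sized lines, e.g. fold("ACGTACGT", 3)
    # -> "ACG\nTAC\nGT"
    return "\n".join(seq[i:i + width] for i in range(0, len(seq), width))

assert fold("ACGTACGT", 3) == "ACG\nTAC\nGT"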
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")
    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")
    parser.add_argument("--remove-regex", dest="remove_regex", type=str,
                        help="regular expression of contigs to remove.")
    parser.add_argument("-e", "--gff-file", dest="gff_file", type=str,
                        help="gff file to use for getting contig sizes.")
    parser.add_argument(
        "-f", "--fixed-width-windows", dest="fixed_width_windows", type=str,
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        # note: the default has to match the option's dest
        fixed_width_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    if args.remove_regex:
        remove_regex = re.compile(args.remove_regex)
    else:
        remove_regex = None

    if args.fixed_width_windows:
        v = list(map(int, args.fixed_width_windows.split(",")))
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]"
                % args.fixed_width_windows)

    # initialise here so that "no source of contig sizes" can be
    # detected below without a NameError
    map_contig2size = None

    if args.gff_file:
        infile = iotools.open_file(args.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        map_contig2size = {}
        for g in gff:
            try:
                map_contig2size[g.mName] = max(map_contig2size[g.mName],
                                               g.end)
            except KeyError:
                # first time this contig is seen
                map_contig2size[g.mName] = g.end
    else:
        gff = None

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if map_contig2size is None:
        raise ValueError("no source of contig sizes supplied")

    # count and emit windows
    counter = E.Counter()

    for contig, size in list(map_contig2size.items()):
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if args.fixed_width_windows:
            for x in range(0, size, window_increment):
                if x + window_size > size:
                    continue
                args.stdout.write("%s\t%i\t%i\n" %
                                  (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            args.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.stop()
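# ----------------------------------------------------------------------
# The window logic above as a reusable generator (a sketch, not part of
# the script). Note that a trailing window shorter than window_size is
# skipped, matching the "if x + window_size > size: continue" above.
def fixed_width_windows(size, window_size, increment):
    # yield (start, end) pairs tiling [0, size)
    for x in range(0, size, increment):
        if x + window_size > size:
            continue
        yield x, min(size, x + window_size)

# e.g. list(fixed_width_windows(10, 4, 4)) -> [(0, 4), (4, 8)]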
import sys

from cgat import Bed
from cgat import IndexedFasta
from cgatcore import iotools
from cgat import Genomics
from cgatcore import experiment as E

parser = E.OptionParser(version="%prog version: $1.0$",
                        usage=globals()["__doc__"])
parser.add_option("-g", "--genome", dest="genome",
                  help="indexed fasta genome sequence")
parser.add_option("-O", "--per-utron-out", dest="outfile",
                  help="File name for output file that will contain one row "
                  "per entry in the input")
options, args = E.start(parser, sys.argv)

genome = IndexedFasta.IndexedFasta(options.genome)

bedfile = Bed.iterator(options.stdin)
splice_site_dict = dict()
outfile = iotools.open_file(options.outfile, "w")
outfile.write("\t".join(("transcript_id", "strand", "ss5", "ss3",
                         "contig", "splice_site_start", "splice_site_end",
                         "utron_size")) + "\n")
for utron in bedfile:
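    # The script is truncated here. Based on the notebook version of the
    # same analysis earlier in this document, the loop body presumably
    # resembles the following sketch (a reconstruction under assumptions,
    # not the original code):
    #
    #   ss5_sequence = genome.getSequence(utron.contig, "+", utron.start,
    #                                     utron.start + 2)
    #   ss3_sequence = genome.getSequence(utron.contig, "+", utron.end - 2,
    #                                     utron.end)
    #   utron_size = utron.end - utron.start
    #   outfile.write("\t".join(map(str, (
    #       utron.name, utron.strand, ss5_sequence, ss3_sequence,
    #       utron.contig, utron.start, utron.end, utron_size))) + "\n")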
def buildUngappedContigBed(infile, outfiles):
    '''
    Constructs :term:`BED` format files containing both gapped and ungapped
    contig sizes from an indexed genome :term:`fasta` file.

    Parameters
    ----------
    infile: str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    assembly_gaps_min_size: int
      `PARAMS` - the minimum size (in nucleotides) for an assembly gap

    Returns
    -------
    outfiles: list
      two separate :term:`BED` format output files containing the contig
      sizes for contigs with and without gaps. The names are defined in
      the `PARAMS` `interface_contigs_ungapped_bed` and
      `interface_gaps_bed` parameters.
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs_nogap = iotools.open_file(outfiles[0], "w")
    outs_gap = iotools.open_file(outfiles[1], "w")
    min_gap_size = PARAMS["assembly_gaps_min_size"]

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        seq = fasta.getSequence(contig)

        def gapped_regions(seq):
            is_gap = seq[0] == "N"
            last = 0
            for x, c in enumerate(seq):
                if c == "N":
                    if not is_gap:
                        last = x
                        is_gap = True
                else:
                    if is_gap:
                        yield (last, x)
                        last = x
                        is_gap = False
            if is_gap:
                yield last, size

        last_end = 0
        for start, end in gapped_regions(seq):
            if end - start < min_gap_size:
                continue
            # write the ungapped region preceding this gap; this includes
            # the leading region of the contig, which the previous
            # "last_end != 0" test silently dropped
            if start > last_end:
                outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, start))
            outs_gap.write("%s\t%i\t%i\n" % (contig, start, end))
            last_end = end

        if last_end < size:
            outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, size))

    outs_nogap.close()
    outs_gap.close()
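# ----------------------------------------------------------------------
# The run-length scan inside gapped_regions() in isolation (a sketch for
# clarity, equivalent in spirit to the nested generator above):
def n_runs(seq):
    """yield (start, end) for maximal runs of 'N' in seq."""
    start = None
    for i, c in enumerate(seq):
        if c == "N":
            if start is None:
                start = i
        elif start is not None:
            yield start, i
            start = None
    if start is not None:
        yield start, len(seq)

# e.g. list(n_runs("ACNNNAC")) -> [(2, 5)]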
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id", usage=globals()["__doc__"]) parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string", help="UCSC genome identifier [default=%default].") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("--extend", dest="extension", type="int", help="extend tags by this number of bases " "[default=%default].") parser.add_option("--shift-size", dest="shift", type="int", help="shift tags by this number of bases " "[default=%default].") parser.add_option("--window-size", dest="window_size", type="int", help="window size to be used in the analysis" "[default=%default].") parser.add_option("--saturation-iterations", dest="saturation_iterations", type="int", help="iterations for saturation analysis " "[default=%default].") parser.add_option("-t", "--toolset", dest="toolset", type="choice", action="append", choices=("saturation", "coverage", "enrichment", "dmr", "rms", "rpm", "all", "convert"), help="actions to perform [default=%default].") parser.add_option("-w", "--bigwig-file", dest="bigwig", action="store_true", help="store wig files as bigwig files - requires a " "genome file [default=%default]") parser.add_option("--treatment", dest="treatment_files", type="string", action="append", help="BAM files for treatment. At least one is required " "[%default]") parser.add_option("--control", dest="control_files", type="string", action="append", help="BAM files for control for differential " "methylation analysis. Optional [%default].") parser.add_option("--input", dest="input_files", type="string", action="append", help="BAM files for input correction. " "Optional [%default].") parser.add_option("--is-not-medip", dest="is_medip", action="store_false", help="data is not MeDIP data and is not expected " "to fit the calibration model. No CpG " "density normalized rms data is computed" "[default=%default].") parser.add_option("--output-rdata", dest="output_rdata", action="store_true", help="in dmr analysis, write R session to file. " "The file name " "is given by --ouptut-filename-pattern [%default].") parser.add_option("--rdata-file", dest="input_rdata", type="string", help="in dmr analysis, read saved R session from " "file. 
This can be used to apply different " "filters [%default]") parser.add_option("--fdr-threshold", dest="fdr_threshold", type="float", help="FDR threshold to apply for selecting DMR " "[default=%default].") parser.add_option("--fdr-method", dest="fdr_method", type="choice", choices=("bonferroni", "BH", "holm", "hochberg", "hommel", "BY", "fdr", "none"), help="FDR method to apply for selecting DMR " "[default=%default].") parser.add_option("--bwa", dest="bwa", action="store_true", help="alignment generated with bwa" "[default=%default].") parser.add_option("--unique", dest="unique", type="float", help="Threshold p-value to determine which read pile\ ups are the result of PCR overamplification" "[default=%default].") parser.add_option("--chroms", dest="chroms", type="str", help="Comma delimited list of chromosomes to include" "[default=%default].") parser.set_defaults(input_format="bam", ucsc_genome="Hsapiens.UCSC.hg19", genome_file=None, extend=0, shift=0, window_size=300, saturation_iterations=10, toolset=[], bigwig=False, treatment_files=[], control_files=[], input_files=[], output_rdata=False, input_rdata=None, is_medip=True, fdr_threshold=0.1, fdr_method="BH", bwa=False, unique=0.001, chroms=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if "convert" in options.toolset: results = [] for line in csv.DictReader(options.stdin, dialect="excel-tab"): if line['edgeR.p.value'] == "NA": continue # assumes only a single treatment/control treatment_name = options.treatment_files[0] control_name = options.control_files[0] status = "OK" try: results.append( Expression.GeneExpressionResult._make(( "%s:%i-%i" % (line['chr'], int(line['start']), int(line['stop'])), treatment_name, float(line['MSets1.rpkm.mean']), 0, control_name, float(line['MSets2.rpkm.mean']), 0, float(line['edgeR.p.value']), float(line['edgeR.adj.p.value']), float(line['edgeR.logFC']), math.pow(2.0, float(line['edgeR.logFC'])), float(line['edgeR.logFC']), # no transform ["0", "1"][float(line['edgeR.adj.p.value']) < options.fdr_threshold], status))) except ValueError as msg: raise ValueError("parsing error %s in line: %s" % (msg, line)) Expression.writeExpressionResults(options.stdout, results) return if len(options.treatment_files) < 1: raise ValueError("please specify a filename with sample data") if options.bigwig and not options.genome_file: raise ValueError("please provide a genome file when outputting bigwig") if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contig_sizes = fasta.getContigSizes() if len(options.toolset) == 0: options.toolset = ["all"] do_all = "all" in options.toolset if options.chroms is None: chrstring = "" else: chroms = options.chroms.split(",") chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms) # load MEDIPS R.library('MEDIPS') genome_file = 'BSgenome.%s' % options.ucsc_genome R.library(genome_file) window_size = options.window_size extend = options.extend shift = options.shift saturation_iterations = options.saturation_iterations uniq = float(options.unique) if options.bwa is True: BWA = "TRUE" else: BWA = "FALSE" if "saturation" in options.toolset or do_all: E.info("saturation analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''sr = MEDIPS.saturation( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, uniq=%(uniq)s, nit = %(saturation_iterations)i, paired = %(paired)s, bwa = 
%(BWA)s, %(chrstring)s nrit = 1)''' % locals()) R.png(E.get_output_file("%s_saturation.png" % fn)) R('''MEDIPS.plotSaturation(sr)''') R('''dev.off()''') R('''write.table(sr$estimation, file ='%s', sep='\t')''' % E.get_output_file("%s_saturation_estimation.tsv" % fn)) outfile = iotools.open_file( E.get_output_file("%s_saturation.tsv" % fn), "w") outfile.write("category\tvalues\n") outfile.write("estimated_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxEstCor''')])) outfile.write("true_correlation\t%s\n" % ",".join(["%f" % x for x in R('''sr$maxTruCor''')])) outfile.write("nreads\t%s\n" % ",".join(["%i" % x for x in R('''sr$numberReads''')])) outfile.close() if "coverage" in options.toolset or do_all: E.info("CpG coverage analysis") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''cr = MEDIPS.seqCoverage( file='%(fn)s', BSgenome='%(genome_file)s', pattern='CG', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''') R('''dev.off()''') R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn)) R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr, type = "hist", t=15)''') R('''dev.off()''') # note: this file is large R('''write.table(cr$cov.res, file=gzfile('%s','w'), sep='\t')''' % E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn)) if 'enrichment' in options.toolset or do_all: E.info("CpG enrichment analysis") outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"), "w") slotnames = (("regions.CG", "regions_CG", "%i"), ("regions.C", "regions_C", "%s"), ("regions.G", "regions_G", "%f"), ("regions.relH", "regions_relH", "%i"), ("regions.GoGe", "regions_GoGe", "%i"), ("genome.CG", "genome_CG", "%s"), ("genome.C", "genome_C", "%s"), ("genome.G", "genome_G", "%i"), ("genome.relH", "genome_relH", "%i"), ("enrichment.score.relH", "enrichment_relH", "%s"), ("enrichment.score.GoGe", "enrichment_GoGe", "%s")) outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n") for fn in options.treatment_files + options.control_files: paired = isPaired(fn) R('''ce = MEDIPS.CpGenrich( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) outfile.write("%s" % fn) for slotname, label, pattern in slotnames: value = tuple(R('''ce$%s''' % slotname)) if len(value) == 0: value = "" outfile.write("\t%s" % pattern % value[0]) outfile.write("\n") outfile.close() if options.input_rdata: E.info("reading R session info from '%s'" % options.input_rdata) R('''load('%s')''' % options.input_rdata) else: if "dmr" in options.toolset or "correlation" in options.toolset \ or do_all: # build four sets for x, fn in enumerate(options.treatment_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''treatment_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''treatment_set = c(%s)''' % ",".join([ "treatment_R%i" % x for x in range(len(options.treatment_files)) ])) if options.control_files: for x, fn in enumerate(options.control_files): paired = isPaired(fn) E.info("loading '%s'" % fn) R('''control_R%(x)i = MEDIPS.createSet( file='%(fn)s', BSgenome='%(genome_file)s', shift=%(shift)i, 
extend=%(extend)i, window_size=%(window_size)i, paired=%(paired)s, bwa=%(BWA)s, %(chrstring)s uniq=%(uniq)s)''' % locals()) R('''control_set = c(%s)''' % ",".join([ "control_R%i" % x for x in range(len(options.control_files)) ])) # build coupling vector R('''CS = MEDIPS.couplingVector(pattern="CG", refObj = treatment_set[[1]])''') if "correlation" in options.toolset or do_all: R('''cor.matrix = MEDIPS.correlation( c(treatment_set, control_set))''') R('''write.table(cor.matrix, file='%s', sep="\t")''' % E.get_output_file("correlation")) if "dmr" in options.toolset or do_all: # Data that does not fit the model causes # "Error in 1:max_signal_index : argument of length 0" # The advice is to set MeDIP=FALSE # See: http://comments.gmane.org/ # gmane.science.biology.informatics.conductor/52319 if options.is_medip: medip = "TRUE" else: medip = "FALSE" fdr_method = options.fdr_method E.info("applying test for differential methylation") R('''meth = MEDIPS.meth( MSet1 = treatment_set, MSet2 = control_set, CSet = CS, ISet1 = NULL, ISet2 = NULL, p.adj = "%(fdr_method)s", diff.method = "edgeR", MeDIP = %(medip)s, CNV = F, minRowSum = 1)''' % locals()) # Note: several Gb in size # Output full methylation data table R('''write.table(meth, file=gzfile('%s', 'w'), sep="\t", row.names=F, quote=F)''' % E.get_output_file("data.tsv.gz")) # save R session if options.output_rdata: R('''save.image(file='%s', safe=FALSE)''' % E.get_output_file("session.RData")) # DMR analysis - test for windows and output if "dmr" in options.toolset: E.info("selecting differentially methylated windows") # test windows for differential methylation fdr_threshold = options.fdr_threshold R('''tested = MEDIPS.selectSig(meth, adj=T, ratio=NULL, p.value=%(fdr_threshold)f, bg.counts=NULL, CNV=F)''' % locals()) R('''write.table(tested, file=gzfile('%s', 'w'), sep="\t", quote=F)''' % E.get_output_file("significant_windows.gz")) # select gain and merge adjacent windows try: R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),]; gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''') E.info('gain output: %s, merged: %s' % (str(R('''dim(gain)''')), str(R('''dim(gain_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(gain_merged, file=of, sep="\t", quote=F, row.names=FALSE, col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute gain windows: msg=%s" % msg) # select loss and merge adjacent windows try: R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),]; loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''') E.info('loss output: %s, merged: %s' % (str(R('''dim(loss)''')), str(R('''dim(loss_merged)''')))) R('''of=gzfile('%s', 'w'); write.table(loss_merged, file=of, sep="\t", quote=F, row.names=F, col.names=F); close(of)''' % E.get_output_file("loss.bed.gz")) except rpy2.rinterface.RRuntimeError as msg: E.warn("could not compute loss windows: msg=%s" % msg) # if "rpm" in options.toolset or do_all: # outputfile = E.get_output_file("rpm.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = T, descr = "rpm")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # compress(outputfile) # if "rms" in options.toolset or do_all: # outputfile = E.get_output_file("rms.wig") # R('''MEDIPS.exportWIG(file = '%(outputfile)s', # data = CONTROL.SET, raw = F, descr = "rms")''' % # locals()) # if options.bigwig: # bigwig(outputfile, contig_sizes) # else: # 
compress(outputfile) # write footer and output benchmark information. E.stop()
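# ----------------------------------------------------------------------
# The MEDIPS script above drives R through rpy2's high-level interface:
# R code is assembled as Python strings and executed with R('''...''').
# A minimal self-contained sketch of the same pattern (no MEDIPS
# installation needed; the variable names are illustrative only):
from rpy2.robjects import r as R

R('''x <- rnorm(100)''')          # run R code for its side effects
mean_x = R('''mean(x)''')[0]      # R vectors index like Python sequences
# parameters are interpolated via Python string formatting, as above:
threshold = 0.5
R('''y <- x[x > %f]''' % threshold)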
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")
    parser.add_argument(
        "-m", "--method", dest="method", type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features",
                 "sanitize", "to-forward-coordinates", "to-forward-strand",
                 "rename-chr"),
        help="method to apply.")
    parser.add_argument("--ignore-strand", dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")
    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input will be treated as gtf.")
    parser.add_argument("-c", "--contigs-tsv-file",
                        dest="input_filename_contigs", type=str,
                        help="filename with contig lengths.")
    parser.add_argument(
        "--agp-file", dest="input_filename_agp", type=str,
        help="agp file to map coordinates from contigs to scaffolds.")
    parser.add_argument("-g", "--genome-file", dest="genome_file", type=str,
                        help="filename with genome.")
    parser.add_argument("--crop-gff-file", dest="filename_crop_gff", type=str,
                        help="GFF/GTF file to crop against.")
    parser.add_argument(
        "--group-field", dest="group_field", type=str,
        help="gff field/attribute to group by, such as gene_id, "
        "transcript_id, ... .")
    parser.add_argument(
        "--filter-range", dest="filter_range", type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to'.")
    parser.add_argument("--sanitize-method", dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome "
                        "names.")
    parser.add_argument(
        "--flank-method", dest="flank_method", type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features.")
    parser.add_argument("--skip-missing", dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised.")
    parser.add_argument(
        "--contig-pattern", dest="contig_pattern", type=str,
        help="a comma-separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize.")
    parser.add_argument(
        "--assembly-report", dest="assembly_report", type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize.")
    parser.add_argument(
        "--assembly-report-hasids", dest="assembly_report_hasIDs", type=int,
        help="set to 1 if the assembly report contains sequence IDs, "
        "0 otherwise.")
    parser.add_argument(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids.")
    parser.add_argument(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids.")
    parser.add_argument(
        "--assembly-extras", dest="assembly_extras", type=str,
        help="additional mismatches between gtf and fasta to fix when "
        "sanitizing the genome.")
    parser.add_argument("--extension-upstream", dest="extension_upstream",
                        type=float,
                        help="extension for upstream end.")
    parser.add_argument("--extension-downstream",
                        dest="extension_downstream", type=float,
                        help="extension for downstream end.")
    parser.add_argument("--min-distance", dest="min_distance", type=int,
                        help="minimum distance of features to merge/join.")
    parser.add_argument("--max-distance", dest="max_distance", type=int,
                        help="maximum distance of features to merge/join.")
    parser.add_argument("--min-features", dest="min_features", type=int,
                        help="minimum number of features to merge/join.")
    parser.add_argument("--max-features", dest="max_features", type=int,
                        help="maximum number of features to merge/join.")
    parser.add_argument(
        "--rename-chr-file", dest="rename_chr_file", type=str,
        help="mapping table between old and new chromosome names. "
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("Empty mapping dictionary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report, comment="#", header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in column 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(
                    "when using an assembly report, please specify sanitize "
                    "method as either 'ucsc' or 'ensembl' to specify the "
                    "direction of conversion")
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            assembly_extras = args.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if args.method in ("forward_coordinates", "forward_strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank",
                       "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                args.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # the pattern has three groups, so the separator has to
                # be unpacked as well
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)", args.filter_range).groups()
            except AttributeError:
                # raising a bare string is invalid in Python 3; raise a
                # proper exception instead
                raise ValueError(
                    "cannot parse range %s" % args.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict:
                id = assembly_dict[id]
            # if not in the dict, the contig name is forced into the
            # desired convention; this is helpful for user-modified
            # gff files that contain additional 
contigs elif args.sanitize_method == "ucsc": if not id.startswith("contig") and not id.startswith("chr"): id = "chr%s" % id elif args.sanitize_method == "ensembl": if id.startswith("contig"): return id[len("contig"):] elif id.startswith("chr"): return id[len("chr"):] return id if args.sanitize_method == "genome": if genome_fasta is None: raise ValueError("please specify --genome-file= when using " "--sanitize-method=genome") f = genome_fasta.getToken else: if args.assembly_report is None: raise ValueError( "please specify --assembly-report= when using " "--sanitize-method=ucsc or ensembl") f = assemblyReport skipped_contigs = collections.defaultdict(int) outofrange_contigs = collections.defaultdict(int) filtered_contigs = collections.defaultdict(int) for gff in gffs: try: gff.contig = f(gff.contig) except KeyError: if args.skip_missing: skipped_contigs[gff.contig] += 1 continue else: raise if genome_fasta: lcontig = genome_fasta.getLength(gff.contig) if lcontig < gff.end: outofrange_contigs[gff.contig] += 1 continue if args.contig_pattern: to_remove = [ re.compile(x) for x in args.contig_pattern.split(",") ] if any([x.search(gff.contig) for x in to_remove]): filtered_contigs[gff.contig] += 1 continue args.stdout.write(str(gff) + "\n") if skipped_contigs: E.info("skipped %i entries on %i contigs: %s" % (sum(skipped_contigs.values()), len(list(skipped_contigs.keys())), str(skipped_contigs))) if outofrange_contigs: E.warn( "skipped %i entries on %i contigs because they are out of range: %s" % (sum(outofrange_contigs.values()), len(list( outofrange_contigs.keys())), str(outofrange_contigs))) if filtered_contigs: E.info("filtered out %i entries on %i contigs: %s" % (sum(filtered_contigs.values()), len(list(filtered_contigs.keys())), str(filtered_contigs))) elif args.method == "rename-chr": if not chr_map: raise ValueError("please supply mapping file") for gff in renameChromosomes(gffs, chr_map): args.stdout.write(str(gff) + "\n") else: for gff in gffs: if args.method == "forward_coordinates": gff.invert(contigs[gff.contig]) if args.method == "forward_strand": gff.invert(contigs[gff.contig]) gff.strand = "+" if agp: # note: this works only with forward coordinates gff.contig, gff.start, gff.end = agp.mapLocation( gff.contig, gff.start, gff.end) args.stdout.write(str(gff) + "\n") E.stop()
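# ----------------------------------------------------------------------
# Usage sketch for the sanitize/rename methods above (assuming the
# cgat-apps command-line wrapper; the assembly report and mapping file
# names are hypothetical):
#
#   zcat in.gtf.gz |
#     cgat gff2gff --method=sanitize --sanitize-method=ucsc \
#         --assembly-report=GRCh38_assembly_report.txt --skip-missing |
#     cgat gff2gff --method=rename-chr --rename-chr-file=chr_map.tsv \
#     > out.gtf
#
# chr_map.tsv is a TAB separated 2-column file of old and new
# chromosome names, as described in the --rename-chr-file help above.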
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with Samtools indexed genome [default=%default].")
    parser.add_option("-w", "--window-size", dest="window", type="int",
                      help="Window size for tiling [default=%default].")
    parser.add_option("-s", "--shift-size", dest="shift", type="int",
                      help="Window shift for tiling [default=%default].")

    parser.set_defaults(
        genome_file=None,
        window=1000,
        shift=1000,
        output_file=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # open the indexed genome
    E.info("Opening input file: %s" % options.genome_file)
    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes(with_synonyms=False)

    # windows are written as BED intervals to stdout
    bed = options.stdout
    shift = options.shift

    # loop over contigs and emit tiled windows
    nwindows = 0
    ncontigs = 0
    for contig, stop in contigs.items():
        ncontigs += 1
        i = 0
        while (i < stop):
            j = min(i + options.window, stop)
            # optionally include a name column:
            # bed.write("%(contig)s\t%(i)i\t%(j)i\t%(contig)s:%(i)i..%(j)i\n"
            #           % locals())
            bed.write("""%(contig)s\t%(i)i\t%(j)i\n""" % locals())
            nwindows += 1
            i += shift

    # Report statistics
    E.info("ncontigs=%i, nwindows=%i" % (ncontigs, nwindows))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None): if not argv: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome.") parser.add_argument("-i", "--ignore-missing", dest="ignore_missing", action="store_true", help="Ignore transcripts on contigs that are not " "in the genome-file.") parser.add_argument("-s", "--restrict-source", dest="restrict_source", type=str, choices=("protein_coding", "pseudogene", "lncRNA"), help="restrict input by source.") parser.add_argument("-m", "--method", dest="method", type=str, choices=( "full", "genome", "exons", "promotors", "tts", "regulons", "tts-regulons", "genes", "territories", "tss-territories", "great-domains", ), help="method for defining segments.") parser.add_argument("-r", "--territory-extension", dest="radius", type=int, help="radius of a territory.") parser.add_argument("-f", "--flank-size", dest="flank", type=int, help="size of the flanking region next to a gene.") parser.add_argument( "--flank-increment-size", dest="increment", type=int, help="size of increment in flank in genestructure annotation ") parser.add_argument("-p", "--promotor-size", dest="promotor", type=int, help="size of a promotor region.") parser.add_argument("-u", "--upstream-extension", dest="upstream", type=int, help="size of region upstream of tss.") parser.add_argument("-d", "--downstream-extension", dest="downstream", type=int, help="size of region downstream of tss.") parser.add_argument("--gene-detail", dest="detail", type=str, choices=("introns+exons", "exons", "introns"), help="level of detail for gene structure annotation ") parser.add_argument("--merge-overlapping-promotors", dest="merge_promotors", action="store_true", help="merge overlapping promotors.") parser.add_argument( "--min-intron-length", dest="min_intron_length", type=int, help="minimum intron length. If the distance between two " "consecutive exons is smaller, the region will be marked " "'unknown'.") parser.add_argument( "--is-unsorted", dest="is_sorted", action="store_false", help="sort input before processing. 
Otherwise, the input is assumed " "to be sorted.") parser.set_defaults( genome_file=None, flank=1000, increment=1000, max_frameshift_length=4, min_intron_length=30, ignore_missing=False, restrict_source=None, method="genome", radius=50000, promotor=5000, merge_promotors=False, upstream=5000, downstream=5000, detail="exons", is_sorted=True, ) (args) = E.start(parser) if args.genome_file: fasta = IndexedFasta.IndexedFasta(args.genome_file) else: raise ValueError("please specify a --genome-file") if not args.restrict_source: iterator = GTF.iterator(args.stdin) elif args.restrict_source: iterator = GTF.iterator_filtered(GTF.iterator(args.stdin), source=args.restrict_source) # elif options.method in ("promotors", "tts", "regulons"): # iterator = GTF.iterator_filtered( GTF.iterator(options.stdin), source = "protein_coding") # else: # iterator = GTF.iterator(options.stdin) if not args.is_sorted: iterator = GTF.iterator_sorted(iterator, sort_order="position") if args.method == "full" or args.method == "genome": segmentor = annotateGenome(iterator, fasta, args) elif args.method == "territories": segmentor = buildTerritories(iterator, fasta, 'gene', args) elif args.method == "tss-territories": segmentor = buildTerritories(iterator, fasta, 'tss', args) elif args.method == "exons": segmentor = annotateExons(iterator, fasta, args) elif args.method == "promotors": segmentor = annotatePromoters(iterator, fasta, args) elif args.method == "regulons": segmentor = annotateRegulons(iterator, fasta, True, args) elif args.method == "tts-regulons": segmentor = annotateRegulons(iterator, fasta, False, args) elif args.method == "tts": segmentor = annotateTTS(iterator, fasta, args) elif args.method == "genes": segmentor = annotateGenes(iterator, fasta, args) elif args.method == "great-domains": segmentor = annotateGREATDomains(iterator, fasta, args) E.stop()
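# ----------------------------------------------------------------------
# A table-driven sketch of the if/elif dispatch above (an illustration,
# not the original code; the annotate*/buildTerritories callables are
# defined elsewhere in this module, and the keyword name used with
# functools.partial below is an assumption about their signatures):
#
#   import functools
#   segmentors = {
#       "full": annotateGenome,
#       "genome": annotateGenome,
#       "exons": annotateExons,
#       "territories": functools.partial(buildTerritories, method='gene'),
#       "tss-territories": functools.partial(buildTerritories,
#                                            method='tss'),
#   }
#   segmentor = segmentors[args.method](iterator, fasta, args)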
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option( "-i", "--ignore-missing", dest="ignore_missing", action="store_true", help= "Ignore transcripts on contigs that are not in the genome-file [default=%default]." ) parser.add_option( "--min-intron-length", dest="min_intron_length", type="int", help= "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]." ) parser.add_option("-m", "--method", dest="method", type="choice", choices=("full", ), help="method to apply [default=%default].") parser.set_defaults( genome_file=None, flank=1000, max_frameshift_length=4, min_intron_length=30, ignore_missing=False, restrict_source=None, method="full", report_step=1000, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if not options.genome_file: raise ValueError("an indexed genome is required.") fasta = IndexedFasta.IndexedFasta(options.genome_file) iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) annotateGenome(iterator, fasta, options) # write footer and output benchmark information. E.stop()
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-g", "--genome-file", dest="genome_file", type=str,
        help="filename with genome.")
    parser.add_argument(
        "-a", "--aggregate-by", dest="aggregate", type=str,
        choices=("name", "contig", "track", "none"),
        help="aggregate counts by feature.")
    parser.add_argument(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percentages.")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (args) = E.start(parser, argv)

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        if args.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if args.add_percent and not args.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if args.aggregate == "track":
        keyf = lambda x: x.track
    elif args.aggregate == "name":
        keyf = lambda x: x.name
    elif args.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(args.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = args.stdout
    key = "track"
    if args.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if args.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))
        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if args.add_percent:
            # set the denominator on the aggregate counter, not on the
            # last per-contig counter from the loop above
            total.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.stop()
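# ----------------------------------------------------------------------
# Note that "Counter" above is not collections.Counter: it is a
# bed-interval accumulator defined elsewhere in this script, with
# add()/setSize() methods and class-level column headers. A minimal
# sketch of the assumed interface (an illustration, not the original
# class):
class Counter(object):
    headers = ["ncontigs", "nintervals", "nbases"]
    headers_percent = ["ncontigs", "nintervals", "nbases", "pbases"]

    def __init__(self):
        self.intervals = 0
        self.bases = 0
        self.contigs = set()
        self.size = None

    def add(self, bed):
        # accumulate one bed interval
        self.intervals += 1
        self.bases += bed.end - bed.start
        self.contigs.add(bed.contig)

    def setSize(self, size):
        # denominator for percentage output
        self.size = size

    def __str__(self):
        cols = [len(self.contigs), self.intervals, self.bases]
        if self.size is not None:
            cols.append(100.0 * self.bases / self.size)
        return "\t".join(map(str, cols))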