def addIntergenicSegment(last, this, fasta, options):
    """add an intergenic segment between last and this.

    At telomeres, either can be None.

    :param last: gff feature upstream of the gap (None at the left telomere).
    :param this: gff feature downstream of the gap (None at the right telomere).
    :param fasta: indexed genome; used only to look up contig lengths.
    :param options: must provide ``flank`` (flank size in bases) and
        ``ignore_missing`` (skip contigs absent from *fasta*).
    :return: number of segments added (as reported by addFlank/addSegment).
    """
    if not this and not last:
        return 0

    nadded = 0
    if not this:
        # last telomere: segment runs from the end of *last* to the contig end
        try:
            lcontig = fasta.getLength(last.contig)
        except KeyError as msg:
            if options.ignore_missing:
                return nadded
            else:
                raise KeyError(msg)
        # flank is clipped at the contig boundary
        flank = min(last.end + options.flank, lcontig)
        nadded += addFlank(last.end, flank, last, options)
        nadded += addSegment("telomeric", flank, lcontig, last, options)
    elif not last:
        # first telomere: segment runs from position 0 up to *this*
        flank = max(0, this.start - options.flank)
        nadded += addSegment("telomeric", 0, flank, this, options)
        nadded += addFlank(flank, this.start, this, options)
    else:
        # intergenic region between two features
        d = this.start - last.end
        flank = options.flank
        if d > flank * 2:
            # gap is wide enough for two full flanks plus an intergenic core
            nadded += addFlank(last.end, last.end + flank, last, options)
            nadded += addSegment("intergenic", last.end + flank,
                                 this.start - flank, (last, this), options)
            nadded += addFlank(this.start - flank, this.start, this, options)
        else:
            # add short flank between two genes. If they can not agree
            # on the directionality, "flank" is used.
            is_positive1 = Genomics.IsPositiveStrand(last.strand)
            is_positive2 = Genomics.IsPositiveStrand(this.strand)
            if is_positive1 and not is_positive2:
                # both genes point away from the gap: 3' of both
                key = "3flank"
            elif not is_positive1 and is_positive2:
                # both genes point towards the gap: 5' of both
                key = "5flank"
            else:
                key = "flank"
            nadded += addSegment(key, last.end, this.start, (last, this),
                                 options)
    return nadded
def toSequence(chunk, fasta):
    """Concatenate the genomic sequence covered by a list of gff features.

    All features must share one contig and one strand. Overlapping
    intervals are merged before extraction. On the negative strand the
    merged intervals are mirrored into reverse-strand coordinates and
    traversed last-to-first, so the pieces concatenate in transcript
    order. Returns "" for an empty chunk.
    """
    if not chunk:
        return ""

    contig = chunk[0].contig
    strand = chunk[0].strand
    for feature in chunk:
        assert feature.strand == strand, "features on different strands."
        assert feature.contig == contig, "features on different contigs."

    merged = Intervals.combine([(feature.start, feature.end)
                                for feature in chunk])
    contig_length = fasta.getLength(contig)

    if not Genomics.IsPositiveStrand(strand):
        # mirror the coordinates and walk the intervals back-to-front
        merged = [(contig_length - hi, contig_length - lo)
                  for lo, hi in merged]
        merged.reverse()

    pieces = []
    for lo, hi in merged:
        pieces.append(fasta.getSequence(contig, strand, lo, hi))
    return "".join(pieces)
def addFlank(start, end, template, options):
    """Label the segment [start, end) as a 5' or 3' flank and record it.

    The segment is "5flank" when it sits on the upstream side of
    *template* once strand is taken into account (before the feature on
    the positive strand, after it on the negative strand), otherwise
    "3flank". Actual bookkeeping is delegated to addSegment.
    """
    upstream = end <= template.start
    on_positive = bool(Genomics.IsPositiveStrand(template.strand))
    # upstream on '+', or downstream on '-', is the 5' side
    label = "5flank" if upstream == on_positive else "3flank"
    return addSegment(label, start, end, template, options)
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use same coordinate
    system (and strand) as the transcript

    fixes 1-ness of variants

    :param variants: iterable of variant records with ``pos``,
        ``genotype`` and ``reference`` attributes (samtools pileup style).
    :param lcontig: contig length, used to mirror coordinates onto the
        negative strand.
    :param strand: strand of the transcript; on the negative strand all
        coordinates and sequences are reverse-complemented.
    :param phased: if False, empty allele strings are dropped from
        unphased indel genotypes.
    :return: list of ExtendedVariant tuples
        (start, end, reference, action, has_wildtype, variantseqs).

    NOTE(review): genotypes/references are wrapped in ``bytes()`` but then
    used with str operations (``split("/")``, ``in``) — this looks
    Python-2 era; verify behaviour under Python 3.
    '''
    new_variants = []

    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            # single-base genotype code: a substitution (action "=")
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:
            # indel genotype such as "+a/-at": strip the leading +/- marker
            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking base so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper())
                           for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype,
            variantseqs)))

    return new_variants
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gene models from stdin (gtf), fetches variants from a
    sqlite/pileup/vcf source, builds per-transcript alleles and writes
    the requested output sections (cds/peptide/table/gtf/map).
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option(
        "-t", "--tablename", dest="tablename", type="string",
        help="tablename to get variants from (in samtools pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option(
        "-f", "--exons-file", dest="filename_exons", type="string",
        help="filename with transcript model information (gtf formatted file) [default=%default].")
    parser.add_option(
        "-r", "--filename-reference", dest="filename_reference",
        type="string",
        help="filename with transcript models of a reference gene set. Stop codons that do not"
        " overlap any of the exons in this file are ignore (gtf-formatted file) [default=%default].")
    parser.add_option(
        "--vcf-file", dest="filename_vcf", type="string",
        help="filename with variants in VCF format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--pileup-file", dest="filename_pileup", type="string",
        help="filename with variants in samtools pileup format. Should be indexed by tabix [default=%default].")
    parser.add_option(
        "--vcf-sample", dest="vcf_sample", type="string",
        help="sample id for species of interest in vcf formatted file [default=%default].")
    parser.add_option(
        "-s", "--seleno-tsv-file", dest="filename_seleno", type="string",
        help="filename of a list of transcript ids that are selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf",
                               "map"),
                      help="sections to output [default=%default].")
    parser.add_option(
        "-k", "--with-knockouts", dest="with_knockouts", action="store_true",
        help="add alleles that are knocked out to fasta and gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        # BUGFIX: was misspelled "filename_referenec", so the intended
        # default never matched the option's dest
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        # BUGFIX: was "{}" (an empty dict); a set matches the branch above
        # and is what membership tests below expect
        seleno = set()

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLlite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separatar
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id
        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(
            transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants,
                                                  all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been knocked
                # out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 and \
                        not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id, transcript_id, allele_id,
                                   contig, strand, "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Extracts fasta sequences for gff/gtf intervals from an indexed
    genome, with optional masking, flank extension, length filtering
    and line folding.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")
    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-m", "--merge-adjacent", dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")
    parser.add_option("-e", "--feature", dest="feature", type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")
    parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks",
                      type="string", metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")
    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")
    parser.add_option("--min-interval-length", dest="min_length", type="int",
                      help="set minimum length for sequences output "
                      "[%default]")
    parser.add_option("--max-length", dest="max_length", type="int",
                      help="set maximum length for sequences output "
                      "[%default]")
    parser.add_option("--extend-at", dest="extend_at", type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")
    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")
    parser.add_option("--extend-with", dest="extend_with", type="string",
                      help="extend using base [default=%default]")
    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")
    parser.add_option("--fold-at", dest="fold_at", type="int",
                      help="fold sequence every n bases[%default].")
    parser.add_option(
        "--fasta-name-attribute", dest="naming_attribute", type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        merge=False,
                        feature=None,
                        filename_masks=None,
                        remove_masked_regions=False,
                        min_length=0,
                        max_length=0,
                        extend_at=None,
                        extend_by=100,
                        extend_with=None,
                        masker=None,
                        fold_at=None,
                        naming_attribute=False)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with IOTools.openFile(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {x.split("=")[0]: x.split("=")[1]
                             for x in chunk[0].attributes.split(";")}
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length or
                (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
            continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with,) * options.extend_by)

            if options.extend_at in ("5", "both"):
                # BUGFIX: was "s[1] = extension + s[1]", which raised an
                # IndexError for single-interval chunks and prepended the
                # 5' extension to the second segment instead of the first
                s[0] = extension + s[0]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" % (name,
                                    contig,
                                    strand,
                                    ";".join(["%i-%i" % x for x in out]),
                                    seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and
    an iterator over gtf annotations.

    Allocates one per-base code array per contig, fills it from the gtf
    stream (non-coding sources via MAP_ENSEMBL, protein_coding via
    UTR/CDS/intron segment writers) and writes splice junctions to the
    "junctions" output file.

    :param options: must provide ``report_step`` and
        ``max_frameshift_length``.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    # NOTE(review): array.array("c") is a Python-2-only typecode (removed
    # in Python 3); left unchanged because the element type is shared
    # with addSegments/addCDS/addIntrons outside this view — confirm
    # before porting.
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("c").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in contig_sizes.items():
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        annotations[contig] = array.array("c", default_code * size)

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")

    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        # BUGFIX: was Python-2-only "except KeyError, msg:" — a syntax
        # error under Python 3; the rest of the file already uses the
        # "as" form.
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" %
                   gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")
            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")
            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" %
                                        (contig,
                                         out_positive,
                                         end,
                                         start,
                                         c.frame,
                                         c.gene_id,
                                         c.transcript_id,
                                         ))
                end = ender(c)
def process(matches):
    """Chain a list of psl matches into one alignment and write it out.

    Builds a DAG over the matches (edges only between pairs that are
    colinear on both query and target and do not overlap on the query),
    finds the shortest source-to-target path with Dijkstra to pick a
    consistent subset, merges the surviving matches into a single
    query-to-target map and writes the combined match to
    ``options.stdout``.

    Relies on the module-level ``options`` object for logging streams
    and verbosity. Returns 1.
    """
    new = matches[0].copy()

    map_query2target = alignlib_lite.py_makeAlignmentBlocks()

    graph = networkx.DiGraph()
    graph.add_nodes_from(range(len(matches) + 2))

    matches.sort(key=lambda x: x.mQueryFrom)

    if Genomics.IsPositiveStrand(matches[0].strand):
        # target coordinates must increase with query coordinates
        f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
    else:
        # on the negative strand the target runs backwards
        f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

    for x in range(0, len(matches)):
        xx = matches[x]
        if options.loglevel >= 6:
            options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

        for y in range(x + 1, len(matches)):
            yy = matches[y]
            d = min(xx.mQueryTo, yy.mQueryTo) - \
                max(xx.mQueryFrom, yy.mQueryFrom)
            if d > 0 or not f(xx, yy):
                continue
            else:
                # BUGFIX: positional attribute dicts were removed in
                # networkx 2.0; the keyword form behaves identically on
                # networkx 1.x as well
                graph.add_edge(x, y, weight=-d)

    source = len(matches)
    target = len(matches) + 1
    for x in range(len(matches)):
        xx = matches[x]
        graph.add_edge(source, x, weight=xx.mQueryFrom)
        graph.add_edge(x, target, weight=xx.mQueryLength - xx.mQueryTo)

    if options.loglevel >= 6:
        networkx.write_edgelist(graph, options.stdlog)

    path = networkx.dijkstra_path(graph, source, target)

    if options.loglevel >= 6:
        options.stdlog.write("# path: %s\n" % (str(path)))

    new_matches = [matches[x] for x in path[1:-1]]

    if len(matches) != len(new_matches):
        E.warn(("query=%s, target=%s, strand=%s: "
                "removed overlapping/out-of-order segments: "
                "before=%i, after=%i") %
               (matches[0].mQueryId,
                matches[0].mSbjctId,
                matches[0].strand,
                len(matches),
                len(new_matches)))

    matches = new_matches

    for match in matches:
        m = match.getMapQuery2Target()
        alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

    new.fromMap(map_query2target, use_strand=True)

    options.stdout.write(str(new) + "\n")
    options.stdout.flush()

    return 1
def main(argv=None):
    """script main for gff2gff.

    Applies the transformation selected by ``--method`` to a GFF/GTF
    stream read from stdin and writes the transformed records to stdout.
    Most methods need contig sizes, supplied either via
    ``--contigs-tsv-file`` or ``--genome-file``.

    :param argv: command line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-m", "--method", dest="method", type="choice",
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features",
                 "sanitize", "to-forward-coordinates", "to-forward-strand"),
        help="method to apply [%default]")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option("-c", "--contigs-tsv-file",
                      dest="input_filename_contigs", type="string",
                      help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option("--crop-gff-file", dest="filename_crop_gff",
                      type="string", help="GFF/GTF file to crop against.")

    # BUG FIX: the original help string contained stray quote characters
    # inside the triple-quoted literal that were shown verbatim in --help.
    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    # typo fix in help text: "eithor" -> "either"
    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_option("--sanitize-method", dest="sanitize_method",
                      type="choice",
                      choices=("ucsc", "ensembl", "genome"),
                      help="method to use for sanitizing chromosome names. "
                      "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option("--skip-missing", dest="skip_missing",
                      action="store_true",
                      help="skip entries on missing contigs. Otherwise an "
                      "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--assembly-report", dest="assembly_report", type="string",
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize [%default].")

    # help text fixed: this option is a flag, not a path (original help
    # was copy-pasted from --assembly-report).
    parser.add_option(
        "--assembly-report-hasids", dest="assembly_report_hasIDs",
        type="int",
        help="set to 1 if identifiers in the assembly report need "
        "remapping for assembled molecules [%default].")

    parser.add_option(
        "--assembly-report-ucsccol", dest="assembly_report_ucsccol",
        type="int",
        help="column in the assembly report containing ucsc contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol",
        type="int",
        help="column in the assembly report containing ensembl contig ids"
        "[%default].")

    parser.add_option(
        "--assembly-extras", dest="assembly_extras", type="str",
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome [%default].")

    parser.add_option("--extension-upstream", dest="extension_upstream",
                      type="float",
                      help="extension for upstream end [%default].")

    parser.add_option("--extension-downstream", dest="extension_downstream",
                      type="float",
                      help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (options, args) = E.Start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.assembly_report:
        df = pd.read_csv(options.assembly_report, comment="#",
                         header=None, sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in columnn 0, ensembl unassigned contigs found in
        # column 4.
        if options.assembly_report_hasIDs == 1:
            ucsccol = options.assembly_report_ucsccol
            ensemblcol = options.assembly_report_ensemblcol
            # BUG FIX: DataFrame.ix was removed in pandas 1.0;
            # use .loc with an explicit boolean mask.
            assembled = df[1] == "assembled-molecule"
            df.loc[assembled, ensemblcol] = df.loc[assembled, 0]
            if options.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif options.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if options.assembly_extras is not None:
            assembly_extras = options.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    # BUG FIX: the original tested for "forward_coordinates" /
    # "forward_strand", which are not valid --method values (the declared
    # choices are "to-forward-coordinates"/"to-forward-strand"), so the
    # guard could never fire for those methods.
    if options.method in ("to-forward-coordinates", "to-forward-strand",
                          "add-flank", "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                # grow the terminal feature in place, clamped to the
                # contig boundaries
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                # emit flanks as separate features
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # BUG FIX: the pattern has three groups (start, separator,
                # end) but the original unpacked only two names, raising
                # ValueError on every valid 'from,to' range.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a plain string is a TypeError in
                # Python 3; raise a proper exception instead.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def assemblyReport(id):
            # translate via the assembly report mapping when available
            if id in assembly_dict.keys():
                id = assembly_dict[id]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful user
            # modified gff files that contain additional contigs
            elif options.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif options.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if options.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                # drop entries extending past the contig end
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [
                    re.compile(x)
                    for x in options.contig_pattern.split(",")]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are "
                   "out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    else:

        for gff in gffs:
            # BUG FIX: compare against the declared choice names; the
            # original compared to "forward_coordinates"/"forward_strand"
            # which never matched, making both methods silent no-ops.
            if options.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.Stop()
def main(argv=None):
    """script main for the legacy flag-based gff2gff interface.

    Each transformation is selected by its own boolean/string option
    (e.g. ``--forward-coordinates``, ``--sanitize``) rather than by a
    single ``--method`` switch. Reads GFF/GTF from stdin, writes to
    stdout.

    :param argv: command line arguments; defaults to ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2gff.py 2868 2010-03-03 10:19:52Z andreas $")

    parser.add_option("-f", "--forward-coordinates",
                      dest="forward_coordinates",
                      help="translate to forward coordinates.",
                      action="store_true")

    parser.add_option("--forward-strand", dest="forward_strand",
                      help="convert to forward strand.",
                      action="store_true")

    parser.add_option("--ignore-strand", dest="ignore_strand",
                      help="ignore strand information.",
                      action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "--add-up-flank", dest="add_up_flank", type="int",
        help="add an upstream flanking segment to first exon of a group.")

    parser.add_option(
        "--add-down-flank", dest="add_down_flank", type="int",
        help="add a downstream flanking segment to last segment of a group.")

    parser.add_option("--extend", dest="extend",
                      help="extend the existing features.",
                      action="store_true")

    # typo fix in help text: "lenghts" -> "lengths"
    parser.add_option("-c", "--contigs", dest="input_filename_contigs",
                      type="string", help="filename with contig lengths.")

    parser.add_option(
        "--filename-agp", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string", help="filename with genome.")

    parser.add_option(
        "--complement-groups", dest="complement_groups",
        action="store_true",
        help="complement groups. Will write introns from exons [%default].")

    # typo fix in help text: "transrcipt_id" -> "transcript_id"
    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    parser.add_option("--combine-groups", dest="combine_groups",
                      action="store_true",
                      help="combine groups.")

    # typo fix in help text: "eithor" -> "either"
    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    # typo fix in help text: "metdo ... of for" -> "method ... of four"
    parser.add_option(
        "--join-features", dest="join_features", type="string",
        help="join features into a single transcript. Consecutive "
        "features are grouped into the same transcript/gene. This method "
        "expects a string of four numbers ``a,b,c,d`` as input with: "
        "a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--merge-features", dest="merge_features", type="string",
        help="merge features. Consecutive features are merged into a "
        "single feature. "
        "This method expects a string of four numbers ``a,b,c,d`` as "
        "input; a,b=minimum/maximum distance between features, "
        "c,d=minimum,maximum number of features.")

    parser.add_option(
        "--crop-unique", dest="crop_unique", action="store_true",
        help="crop overlapping intervals, keeping only intervals that "
        "are unique [default=%default]")

    parser.add_option(
        "--crop", dest="crop", type="string",
        help="crop features in gff file with features in another file. "
        "If a feature falls in the middle of another, two entries will "
        "be output.")

    parser.add_option(
        "--sanitize", dest="sanitize", type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help="sanitize chr names for ucsc or ensembl or use the genome "
        "translator [%default].")

    parser.add_option(
        "--skip-missing", dest="skip_missing", action="store_true",
        help="skip entries on missing contigs. Otherwise an exception "
        "is raised [%default].")

    # typo fix in help text: "runnnig" -> "running"
    parser.add_option(
        "--remove-contigs", dest="remove_contigs", type="string",
        action="store",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running sanitize [%default].")

    parser.set_defaults(
        forward_coordinates=False,
        forward_strand=False,
        input_filename_contigs=False,
        input_filename_agp=False,
        genome_file=None,
        sanitize=None,
        add_up_flank=None,
        add_down_flank=None,
        extend=False,
        complement_groups=False,
        combine_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        join_features=None,
        merge_features=None,
        output_format="%06i",
        skip_missing=False,
        remove_contigs=None,
        is_gtf=False,
        group_field=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    # BUG FIX: contigs was referenced below even when neither a contig
    # file nor a genome file was given, raising NameError instead of the
    # intended ValueError.
    contigs = None
    if options.input_filename_contigs:
        contigs = Genomics.ReadContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()
    else:
        genome_fasta = None

    if (options.forward_coordinates or options.forward_strand) \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.add_up_flank or options.add_down_flank:

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            # BUG FIX: Python 2 cmp-style sort
            # (chunk.sort(lambda x, y: cmp(x.start, y.start))) is invalid
            # in Python 3; use a key function.
            chunk.sort(key=lambda x: x.start)
            lcontig = contigs[chunk[0].contig]

            if options.extend:
                if options.add_up_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_up_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + options.add_up_flank)
                if options.add_down_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig,
                            chunk[-1].end + options.add_down_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - options.add_down_flank)
            else:
                if options.add_up_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(
                            0, gff.start - options.add_up_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(
                            lcontig, gff.end + options.add_up_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if options.add_down_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        # BUG FIX: copy-paste error - the original
                        # extended by add_up_flank here.
                        gff.end = min(
                            lcontig, gff.end + options.add_down_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        # BUG FIX: copy-paste error - the original
                        # extended by add_up_flank here.
                        gff.start = max(
                            0, gff.start - options.add_down_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.complement_groups:

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)
        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            # BUG FIX/consistency: bare chunk.sort() relied on GTF.Entry
            # comparison (Python 2 __cmp__); sort explicitly by position
            # as the newer main() does.
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.combine_groups:

        iterator = GTF.joined_iterator(gffs)
        for chunk in iterator:
            # BUG FIX/consistency: explicit positional sort (see above).
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.join_features:
        combineGFF(gffs, options, merge=False)

    elif options.merge_features:
        combineGFF(gffs, options, merge=True)

    elif options.crop:
        cropGFF(gffs, options)

    elif options.crop_unique:
        cropGFFUnique(gffs, options)

    elif options.filter_range:

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    r"(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                # BUG FIX: three groups (start, separator, end) were
                # unpacked into two names, raising ValueError on every
                # valid 'from,to' range.
                start, sep, end = re.match(
                    r"(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                # BUG FIX: raising a plain string is a TypeError in
                # Python 3.
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        if options.loglevel >= 2:
            options.stdlog.write(
                "# filter: contig=%s, strand=%s, interval=%s\n" %
                (str(contig), str(strand), str(interval)))
            options.stdlog.flush()

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.sanitize:

        def toUCSC(id):
            # force names into UCSC convention (chrN)
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            # strip UCSC-style prefixes to get ensembl-style names
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using --sanitize=genome")
            f = genome_fasta.getToken
        elif options.sanitize == "ucsc":
            f = toUCSC
        elif options.sanitize == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            # BUG FIX: "except KeyError, msg:" is Python 2 syntax and a
            # SyntaxError under Python 3.
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                # drop entries extending past the contig end
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.remove_contigs:
                to_remove = [
                    re.compile(x)
                    for x in options.remove_contigs.split(",")]
                if any([x.match(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(skipped_contigs.keys()),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are "
                   "out of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(outofrange_contigs.keys()),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(filtered_contigs.keys()),
                    str(filtered_contigs)))

    # NOTE(review): unlike the newer main(), this version does not call
    # E.Stop() at the end - confirm whether that is intentional.
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads GTF/GFF intervals from stdin, fetches SNP genotypes for each
    interval from the sqlite tables named by ``--tracks``, builds
    per-track allele sequences against the reference genome and writes
    them as a MAF-formatted multiple alignment to stdout.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string",
                      action="append",
                      help="tracks (tablenames) to use in sqlite database "
                      "[default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the "
                      "alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress",
                      action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track",
                      type="string",
                      help="regular expression pattern for track "
                      "[default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        reference_name="reference",
        # BUG FIX: the code below reads options.reference, but only
        # reference_name had a default, so the reference row was
        # labelled "None.<contig>" unless -r was given explicitly.
        reference="reference",
        pattern_track="(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        # NOTE(review): the per-interval loop below requires fasta
        # (fasta.getLength); without --genome-file it will fail with
        # AttributeError on the first interval - confirm intended.
        fasta = None

    # NOTE(review): the original if/else had two identical branches for
    # is_gtf; presumably the non-gtf branch was meant to use a plain GFF
    # iterator - confirm against GTF/GFF module.
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    # NOTE(review): track and contig names are %-interpolated into the
    # SQL (sqlite3 cannot bind table names); safe only while tracks and
    # input come from a trusted source.
    statement = '''SELECT pos, reference, genotype
    FROM %(track)s
    WHERE contig = '%(contig)s' AND
    pos BETWEEN %(extended_start)s and %(extended_end)s
    '''

    counts = E.Counter()
    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0]
            for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." %
            options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % locals())
            all_variants.append(
                list(map(Variants.Variant._make, cc.fetchall())))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" %
                (contig, region_start, region_end,
                 sum([len(x) for x in all_variants]),
                 len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per
        # mali column
        # BUG FIX: the counts are string lengths; with the float default
        # dtype of numpy.ones, "-" * (colcounts[x] - len(c)) in
        # __addGaps raises TypeError. Use an integer dtype.
        colcounts = numpy.ones(lseq, dtype=numpy.int64)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq, variants,
                                      reference_start=region_start)
            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

        def __addGaps(sequence, colcounts):
            '''output gapped sequence.'''
            r = []
            for x, c in enumerate(sequence):
                r.append(c + "-" * (colcounts[x] - len(c)))
            return "".join(r)

        name = ".".join((options.reference, contig))
        if is_positive:
            pos = region_start
        else:
            # NOTE(review): maps the start, not the end, into reverse
            # coordinates - confirm this matches MAF conventions.
            pos = lcontig - region_start
        size = lseq
        seq = __addGaps(reference_seq, colcounts)
        outfile.write(maf_format % (locals()))

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = __addGaps(allele, colcounts)
                if not is_positive:
                    # BUG FIX: the original discarded the return value
                    # (strings are immutable), so negative-strand
                    # alleles were emitted uncomplemented.
                    seq = Genomics.complement(seq)
                size = len(seq) - seq.count("-")
                name = ".".join((track + "-%i" % aid, contig))
                outfile.write(maf_format % (locals()))
        outfile.write("\n")

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf from stdin, extracts the corresponding sequences from
    an indexed genome and writes them as fasta to stdout.  Features can
    be filtered, merged, masked, length-filtered and extended at either
    end before extraction.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-m", "--merge", dest="merge", action="store_true",
        help="merge adjacent intervals with the same attributes. "
        "[default=%default]")

    parser.add_option(
        "-e", "--feature", dest="feature", type="string",
        help="filter by a feature, for example 'exon', 'CDS'. If "
        "set to the empty string, all entries are output [%default].")

    parser.add_option(
        "-f", "--filename-masks", dest="filename_masks", type="string",
        metavar="gff",
        help="mask sequences with regions given in gff file [%default].")

    parser.add_option("--remove-masked-regions", dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option(
        "--min-length", dest="min_length", type="int",
        help="set minimum length for sequences output [%default]")

    parser.add_option(
        "--max-length", dest="max_length", type="int",
        help="set maximum length for sequences output [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at no end, 3', 5' or both ends. If "
        "3only or 5only are set, only the added sequence "
        "is returned [default=%default]")

    parser.add_option("--extend-by", dest="extend_by", type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        masker=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
    else:
        # A genome is required to extract any sequence - fail early with
        # a clear message instead of a NameError further down.
        raise ValueError("please supply a genome via --genome-file")

    # Build the iterator over input chunks: one transcript per chunk for
    # gtf input; otherwise merged or plain chunks of gff lines.  Each
    # chunk is a list of features sharing (for gtf) one transcript id.
    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(sys.stdin))
    else:
        gffs = GTF.iterator(sys.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with open(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GFF.iterator(infile))

        # convert intervals to intersectors for fast overlap lookup
        for contig in e.keys():
            intersector = bx.intervals.intersection.Intersecter()
            for start, end in e[contig]:
                intersector.add_interval(bx.intervals.Interval(start, end))
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    for ichunk in iterator:

        ninput += 1

        if feature:
            # NOTE: use a list comprehension, not filter() - under
            # python3 a filter object has no len() and the check below
            # would raise a TypeError.
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig,
                                       ichunk[0].start,
                                       ichunk[0].end,
                                       str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(start, end)]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    # String exceptions are illegal in python3 - raise a
                    # proper exception type instead.
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name,
                             str([(x.start, x.end) for x in chunk]),
                             masked_regions))
                    continue

        # `out` keeps the (possibly extension-mutated, see below)
        # genomic coordinates for the fasta header.
        out = intervals

        if options.extend_at:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            # convert to coordinates on the reverse strand and keep the
            # extraction order 5' to 3'
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [fasta.getSequence(contig, strand, start, end)
             for start, end in intervals]

        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        length = sum([len(x) for x in s])

        if (length < options.min_length or
                (options.max_length and length > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# skipped because length out of bounds "
                    "%s: regions=%s len=%i\n" % (name, str(intervals), length))
            continue

        options.stdout.write(
            ">%s %s:%s:%s\n%s\n" %
            (name, contig, strand,
             ";".join(["%i-%i" % x for x in out]),
             "\n".join(s)))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.Stop()
flank = max(0, this.start - options.flank) nadded += addSegment("telomeric", 0, flank, this, options) nadded += addFlank(flank, this.start, this, options) else: # intergenic region d = this.start - last.end flank = options.flank if d > flank * 2: nadded += addFlank(last.end, last.end + flank, last, options) nadded += addSegment("intergenic", last.end + flank, this.start - flank, (last, this), options) nadded += addFlank(this.start - flank, this.start, this, options) else: # add short flank between two genes. If they can not agree # on the directionality, "flank" is used. is_positive1 = Genomics.IsPositiveStrand(last.strand) is_positive2 = Genomics.IsPositiveStrand(this.strand) if is_positive1 and not is_positive2: key = "3flank" elif not is_positive1 and is_positive2: key = "5flank" else: key = "flank" nadded += addSegment(key, last.end, this.start, (last, this), options) return nadded # -----------------------------------------------------------------------------