def updateIndels(self, snp, is_negative_strand):

    contig = snp.chromosome
    lcontig = self.mFasta.getLength(contig)

    # get location of insertion/deletion. The location
    # is after position, hence get position and position + 1
    code = self.mAnnotations.getSequence(contig, "+", snp.pos, snp.pos + 2)
    self.mCode = code

    variants = snp.genotype.split("/")
    for variant in variants:

        if variant[0] == "*":
            self.mVariantType.append("W")

        elif variant[0] == "+":
            toinsert = variant[1:]
            self.mVariantType.append("I")

        elif variant[0] == "-":
            todelete = variant[1:]
            # deletions need to be looked at in a wider range
            self.mVariantType.append("D")

        else:
            raise ValueError("unknown variant sign '%s'" % variant[0])

    # ignore non-coding indels: skip if neither flanking position is coding
    if code[0] not in 'abcABC' and code[1] not in 'abcABC':
        return

    if is_negative_strand:
        variants = [Genomics.complement(x) for x in variants]

    for reference_codon in self.mReferenceCodons:

        variants = snp.genotype.split("/")
        variants = [x[1:] for x in variants]

        for variant in variants:
            if len(variant) % 3 != 0:
                self.mVariantCodons.append("!")
            else:
                self.mVariantCodons.append(variant)

    self.mVariantAAs.extend(
        [Genomics.translate(x) for x in self.mVariantCodons])
def countMotifs(infile, motifs):
    '''find regular expression *motifs* in sequences within fasta formatted
    *infile*.
    '''

    it = FastaIterator.FastaIterator(infile)
    positions = []

    while 1:
        try:
            seq = next(it)
        except StopIteration:
            break
        if not seq:
            break

        rseq = Genomics.complement(seq.sequence)
        lsequence = len(seq.sequence)
        pos = []
        for motif, pattern in motifs:
            for x in pattern.finditer(seq.sequence):
                pos.append((motif, "+", x.start(), x.end()))
            for x in pattern.finditer(rseq):
                pos.append(
                    (motif, "-", lsequence - x.end(), lsequence - x.start()))

        positions.append((seq.title, pos))

    return positions
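# Usage sketch for countMotifs (illustrative, not part of the original
# source): assumes an open fasta file handle and a list of
# (name, compiled-regex) motif pairs, which is the shape the loop above
# expects. The input filename is hypothetical.
def exampleCountMotifs():
    import re
    motifs = [("CpG", re.compile("CG")),
              ("TATA-box", re.compile("TATA[AT]A[AT]"))]
    with open("sequences.fasta") as infile:
        for title, hits in countMotifs(infile, motifs):
            # each hit is (motif, strand, start, end) in forward-strand
            # coordinates
            print(title, hits)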
def updateSNPs(self, snp, is_negative_strand, pos):
    """update SNPs."""

    contig = snp.chromosome
    lcontig = self.mFasta.getLength(contig)
    reference_base = snp.reference_base

    if snp.genotype in "ACGTacgt":
        # homozygous substitution
        self.mVariantType.append("O")
    else:
        # heterozygous substitution
        self.mVariantType.append("E")

    # switch reference strand codon to correct strand
    if reference_base != "*" and is_negative_strand:
        reference_base = Genomics.complement(reference_base)

    # collect all possible variants of reference codons
    for reference_codon in self.mReferenceCodons:

        self.mReferenceAAs.append(Genomics.translate(reference_codon))

        # process single base changes
        variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)

        if reference_codon[pos] != reference_base:
            raise ValueError(
                "base mismatch at %i (codon=%s,%i): "
                "codon:%s != genome:%s; `%s`" %
                (snp.pos, reference_codon, pos,
                 reference_codon[pos], reference_base,
                 ";".join(map(str, snp))))

        for variant_base in variant_bases:
            if is_negative_strand:
                variant_base = Genomics.complement(variant_base)

    self.mVariantAAs.extend(
        [Genomics.translate(x) for x in self.mVariantCodons])
def GetMapSequences(sequences):

    map_cluster2sequence = {}
    map_sequence2cluster = {}

    # cluster sequences by identity
    # (clumsy sort, use hashes for bigger sets)
    for key, sequence in sequences.items():
        h = Genomics.GetHID(sequence)

        if h not in map_cluster2sequence:
            map_cluster2sequence[h] = []

        map_sequence2cluster[key] = h
        map_cluster2sequence[h].append(key)

    return map_cluster2sequence, map_sequence2cluster
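# Illustrative check of the clustering above (hypothetical input values):
# identical sequences share a hash id and therefore a cluster, while a
# distinct sequence gets its own.
def exampleGetMapSequences():
    sequences = {"id1": "ACGT", "id2": "ACGT", "id3": "GGCC"}
    map_cluster2sequence, map_sequence2cluster = GetMapSequences(sequences)
    assert map_sequence2cluster["id1"] == map_sequence2cluster["id2"]
    assert map_sequence2cluster["id1"] != map_sequence2cluster["id3"]
    assert sorted(map_cluster2sequence[map_sequence2cluster["id1"]]) == \
        ["id1", "id2"]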
def countSites(model):
    """count number of expected synonymous/nonsynonymous sites in a grammar.
    """

    # number of synonymous/non-synonymous sites
    n, s = 0.0, 0.0

    xpi = model.evaluateTerminalFrequencies()[('COD0', 'COD1', 'COD2')]

    # translate the frequencies to upper-case codon strings
    pi = {}
    for codon, f in xpi.items():
        pi["".join(codon).upper()] = f

    for codon, freq in pi.items():
        try:
            degeneracy = Genomics.GetDegeneracy(codon)
        except KeyError:
            continue

        for x in range(1, 4):
            d = (degeneracy[x] - 1.0) / 3.0
            s += freq * d
            n += freq * (1.0 - d)

    assert float("%5.2f" % (n + s)) == 3.0

    return n, s
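# Worked check of the site-counting rule above (pure arithmetic, no
# dependency on the grammar model): a four-fold degenerate position
# contributes one full synonymous site, a two-fold degenerate position
# one third of a site, and a non-degenerate position none.
def exampleSiteCounting():
    for degeneracy, expected in ((4, 1.0), (2, 1.0 / 3.0), (1, 0.0)):
        d = (degeneracy - 1.0) / 3.0
        assert abs(d - expected) < 1e-12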
def getCopy(self):
    """return a new copy.
    """

    new_entry = Prediction()

    new_entry.mExpand = self.mExpand

    new_entry.mPredictionId = self.mPredictionId
    new_entry.mQueryToken = self.mQueryToken
    new_entry.mQueryFrom = self.mQueryFrom
    new_entry.mQueryTo = self.mQueryTo
    new_entry.mSbjctToken = self.mSbjctToken
    new_entry.mSbjctStrand = self.mSbjctStrand
    new_entry.mSbjctFrom = self.mSbjctFrom
    new_entry.mSbjctTo = self.mSbjctTo
    new_entry.mRank = self.mRank
    new_entry.score = self.score
    new_entry.mQueryLength = self.mQueryLength
    new_entry.mQueryCoverage = self.mQueryCoverage
    new_entry.mNGaps = self.mNGaps
    new_entry.mNFrameShifts = self.mNFrameShifts
    new_entry.mNIntrons = self.mNIntrons
    new_entry.mNSplits = self.mNSplits
    new_entry.mNStopCodons = self.mNStopCodons
    new_entry.mPercentIdentity = self.mPercentIdentity
    new_entry.mPercentSimilarity = self.mPercentSimilarity
    new_entry.mTranslation = self.mTranslation
    new_entry.mSbjctGenomeFrom = self.mSbjctGenomeFrom
    new_entry.mSbjctGenomeTo = self.mSbjctGenomeTo
    new_entry.mAlignmentString = self.mAlignmentString
    new_entry.mQueryAli = self.mQueryAli
    new_entry.mSbjctAli = self.mSbjctAli

    if self.mExpand:
        new_entry.mMapPeptide2Translation = \
            alignlib_lite.py_makeAlignmentVector()
        alignlib_lite.py_copyAlignment(
            new_entry.mMapPeptide2Translation,
            self.mMapPeptide2Translation)
        new_entry.mMapPeptide2Genome = Genomics.String2Alignment(
            new_entry.mAlignmentString)
    else:
        new_entry.mMapPeptide2Translation = \
            self.mMapPeptide2Translation = None
        new_entry.mMapPeptide2Genome = self.mMapPeptide2Genome = None

    return new_entry
def ClusterPeptidesByHid(peptides):
    """cluster peptide sequences by hid."""
    map_cluster2peptide = {}
    map_peptide2cluster = {}

    # cluster peptides by identity
    # (clumsy sort, use hashes for bigger sets)
    for key, sequence in peptides.items():
        h = Genomics.GetHID(sequence)

        if h not in map_cluster2peptide:
            map_cluster2peptide[h] = []

        map_peptide2cluster[key] = h
        map_cluster2peptide[h].append(key)

    return map_cluster2peptide, map_peptide2cluster
def annotatePromoters(iterator, fasta, options):
    """annotate promoters within iterator.

    Entries specified with ``--restrict-source`` are annotated.
    """

    gene_iterator = GTF.gene_iterator(iterator)

    ngenes, ntranscripts, npromotors = 0, 0, 0

    for gene in gene_iterator:
        ngenes += 1
        is_negative_strand = Genomics.IsNegativeStrand(gene[0][0].strand)
        lcontig = fasta.getLength(gene[0][0].contig)
        promotors = []
        transcript_ids = []
        for transcript in gene:

            ntranscripts += 1
            mi, ma = min([x.start for x in transcript]), \
                max([x.end for x in transcript])
            transcript_ids.append(transcript[0].transcript_id)
            # if tss is directly at start/end of contig, the tss will be
            # within an exon. otherwise, it is outside an exon.
            if is_negative_strand:
                promotors.append(
                    (min(lcontig - options.promotor, ma),
                     min(lcontig, ma + options.promotor)))
            else:
                promotors.append(
                    (max(0, mi - options.promotor),
                     max(options.promotor, mi)))

        if options.merge_promotors:
            # merge the promotors (and rename - as sort order might have
            # changed)
            promotors = Intervals.combine(promotors)
            transcript_ids = ["%i" % (x + 1) for x in range(len(promotors))]

        gtf = GTF.Entry()
        gtf.fromGTF(gene[0][0], gene[0][0].gene_id, gene[0][0].gene_id)
        gtf.source = "promotor"

        x = 0
        for start, end in promotors:
            gtf.start, gtf.end = start, end
            gtf.transcript_id = transcript_ids[x]
            options.stdout.write("%s\n" % str(gtf))
            npromotors += 1
            x += 1

    E.info("ngenes=%i, ntranscripts=%i, npromotors=%i" %
           (ngenes, ntranscripts, npromotors))
def transform_third_codon(start, end, intervals_with_gff):
    """transform: only return nucleotide positions in window (start, end)
    that are in third codon position.
    """
    intervals = []
    for istart, iend, gff in intervals_with_gff:

        if gff.frame == ".":
            raise ValueError("need a frame for third codon positions.")

        # frame = nucleotides from start to next codon
        frame = int(gff.frame)

        # to make life easier, convert to 0-based coordinates,
        # with zero starting at first position in window
        # re-arrange positions on negative strand
        if Genomics.IsNegativeStrand(gff.strand):
            # convert to negative strand coordinates counting from 0
            coordinate_offset = end
            reverse = True
            istart, iend = end - iend, end - istart
        else:
            istart, iend = istart - start, iend - start
            reverse = False
            coordinate_offset = start

        # make sure that you start on a third codon position and within
        # the window
        if istart < 0:
            frame = (frame + istart) % 3
            istart = 0

        if frame != 0:
            istart -= (3 - frame)

        istart += 2

        iend = min(iend, end - start)

        for x in range(istart, iend, 3):

            if reverse:
                c = coordinate_offset - x - 1
            else:
                c = coordinate_offset + x
            intervals.append((c, c + 1))

    return Intervals.combineIntervals(intervals)
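# Toy example for transform_third_codon (illustrative): only the frame and
# strand attributes of the gff argument are read, so a minimal stand-in
# suffices. Assumes the Genomics and Intervals modules imported by the
# surrounding code are available.
def exampleTransformThirdCodon():
    class MockGFF(object):
        def __init__(self, frame, strand):
            self.frame, self.strand = frame, strand

    # an in-frame interval 0..9 on the forward strand: third codon
    # positions are 2, 5 and 8, i.e. (2, 3), (5, 6), (8, 9) before
    # interval combination
    intervals = transform_third_codon(0, 9, [(0, 9, MockGFF("0", "+"))])
    print(intervals)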
def getEntropy(self, usage=None):
    """return entropy of a source in terms of a reference usage.

    Also called conditional entropy or encoding cost. Note that here
    I compute the sum over 20 entropies, one for each amino acid.

    If not given, calculate entropy.
    """

    e = 0

    freqs = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts, self.mPseudoCounts)
    if usage is None:
        usage = freqs
    for codon, count in list(self.mCodonCounts.items()):
        e -= freqs[codon] * math.log(usage[codon])
    return e
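# Stand-alone check of the encoding-cost formula used above (illustrative,
# no dependency on Genomics): with usage == freqs the sum reduces to the
# Shannon entropy of the codon distribution, here log(2) for two equally
# frequent codons.
def exampleEncodingCost():
    import math
    freqs = {"AAA": 0.5, "AAG": 0.5}
    usage = freqs
    e = -sum(freqs[c] * math.log(usage[c]) for c in freqs)
    assert abs(e - math.log(2)) < 1e-12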
def fillFromTable(self, table_row):

    if len(table_row) == 25:
        (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
         self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
         self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
         self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons, self.mPercentIdentity,
         self.mPercentSimilarity, self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString) = table_row
    elif len(table_row) >= 26:
        # extra columns beyond the first 26 are ignored
        (self.mPredictionId, self.mQueryToken, self.mSbjctToken,
         self.mSbjctStrand, self.mRank, self.score, self.mQueryFrom,
         self.mQueryTo, self.mQueryAli, self.mSbjctFrom, self.mSbjctTo,
         self.mSbjctAli, self.mQueryLength, self.mQueryCoverage,
         self.mNGaps, self.mNFrameShifts, self.mNIntrons,
         self.mNSplits, self.mNStopCodons, self.mPercentIdentity,
         self.mPercentSimilarity, self.mTranslation,
         self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
         self.mAlignmentString, self.mNAssembled) = table_row[:26]
    else:
        raise ValueError("unknown format: %i fields" % len(table_row))

    if self.mExpand:
        self.mMapPeptide2Translation = \
            alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def GetBlocks(self, s1, s2):
    """the strings have to be already aligned!!!"""

    handle_tmpfile, filename_tmpfile = tempfile.mkstemp()
    os.write(handle_tmpfile, ">s1\n%s\n" % (s1))
    os.write(handle_tmpfile, ">s2\n%s\n" % (s2))
    os.close(handle_tmpfile)

    statement = " ".join(("(",
                          self.mEnvironment,
                          self.mExecutable % filename_tmpfile,
                          self.mOptions,
                          ")"))

    p = subprocess.Popen(statement,
                         shell=True,
                         stdin=subprocess.PIPE,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         close_fds=True)

    (file_stdin, file_stdout, file_stderr) = (p.stdin, p.stdout, p.stderr)

    file_stdin.close()
    lines = file_stdout.readlines()
    lines_stderr = file_stderr.readlines()
    exit_code = p.wait()

    file_stdout.close()
    file_stderr.close()

    if exit_code:
        raise ValueError("Error while executing statement %s" % statement)

    if not os.path.exists(filename_tmpfile + "-gb"):
        os.remove(filename_tmpfile)
        return "", ""

    lines = open(filename_tmpfile + "-gb").readlines()
    r = Genomics.ParseFasta2Hash(lines)

    if not r:
        return "", ""

    os.remove(filename_tmpfile)
    os.remove(filename_tmpfile + "-gb")
    os.remove(filename_tmpfile + "-gb.htm")

    return r['s1'], r['s2']
def loadSequence(self, sequence, seqtype="na"):
    """load sequence properties from a sequence."""

    SequenceProperties.loadSequence(self, sequence, seqtype)

    if len(sequence) % 3:
        raise ValueError(
            '''sequence length is not a multiple of 3 (length=%i)''' %
            (len(sequence)))

    # counts of amino acids
    self.mCountsAA = {}

    for x in Bio.Alphabet.IUPAC.extended_protein.letters:
        self.mCountsAA[x] = 0

    for codon in (sequence[x:x + 3] for x in range(0, len(sequence), 3)):
        aa = Genomics.MapCodon2AA(codon)
        self.mCountsAA[aa] += 1
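# Illustrative check of the codon-splitting idiom used above (pure Python,
# no dependency on Genomics): the generator yields successive
# non-overlapping triplets.
def exampleCodonSplit():
    sequence = "ATGGCCTAA"
    codons = [sequence[x:x + 3] for x in range(0, len(sequence), 3)]
    assert codons == ["ATG", "GCC", "TAA"]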
def filterMali(mali, method="3rd"):
    """build a new multiple alignment based on a filter.

    valid methods are
    3rd:        only third positions
    4d:         only four-fold degenerate sites
    """

    if method not in ("3rd", "4d"):
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        columns = list(range(2, mali.getWidth(), 3))

    elif method == "4d":
        # translate
        trans_mali = Mali.Mali()
        for id, seq in mali.items():
            s = []
            sequence = seq.mString
            l = len(sequence)
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                s.append(aa)

            trans_mali.addSequence(id, 0, l, "".join(s))

        # get four-fold (or higher) degenerate amino acids
        aa_columns = trans_mali.getColumns()
        columns = []
        for c in range(len(aa_columns)):
            chars = set(aa_columns[c])
            chars = chars.difference(set(mali.mGapChars))
            if len(chars) == 1:
                char = list(chars)[0].upper()
                try:
                    deg = Genomics.DegeneracyAA[char]
                except KeyError:
                    continue
                if deg >= 4:
                    columns.append(c * 3)

    mali.takeColumns(columns)
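# Usage sketch for filterMali (illustrative; assumes the Mali module used
# above and a codon alignment whose width is a multiple of 3, read with a
# hypothetical input file). The mali object is modified in place, keeping
# only the selected columns.
#
# mali = Mali.Mali()
# mali.readFromFile(open("alignment.fasta"))  # assumed Mali API
# filterMali(mali, "3rd")   # keep third codon positions only
# filterMali(mali, "4d")    # keep four-fold degenerate sites only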
def WriteOverviewFrequencies(fields, table, options):

    WriteHeader(options)

    output = []
    for x in range(1, len(fields) - 1):
        for y in range(x + 1, len(fields)):

            frequencies = {}

            # collect frequencies per amino acid
            for c in table:
                codon = c[0]
                f1 = c[x]
                f2 = c[y]
                aa = Genomics.MapCodon2AA(codon)
                if aa not in frequencies:
                    frequencies[aa] = []
                frequencies[aa].append((codon, f1, f2))

            changed = {}

            # sort for both genomes, and check if preference has changed
            for aa, codons in frequencies.items():
                codons.sort(key=lambda c: c[1])
                pref_codon1 = codons[-1]
                codons.sort(key=lambda c: c[2])
                pref_codon2 = codons[-1]

                if pref_codon1 == pref_codon2:
                    continue
                else:
                    changed[aa] = [(True, pref_codon1[2], pref_codon1[0]),
                                   (False, pref_codon2[1], pref_codon2[0])]

            output += WriteChanges(fields[x], fields[y], changed, options)

    WriteOutput(output, options)
def maskMali(mali, method="seg"):
    """mask multiple alignment according to an external masker.
    """

    if method == "seg":
        masker = Masker.MaskerSeg()
    elif method == "bias":
        masker = Masker.MaskerBias()
    elif method == "random":
        masker = Masker.MaskerRandom()
    else:
        raise ValueError("unknown masking method '%s'" % method)

    if mali.getAlphabet() == "na" and method in ("seg", "bias"):
        # nucleotide alignment: translate, mask the protein sequence and
        # map masked residues back to codon columns
        for id, s in mali.items():
            ss = Genomics.TranslateDNA2Protein(s.mString)
            mss = masker(ss)
            columns = []
            for x in range(0, len(mss)):
                if mss[x] in ("X", "x"):
                    columns += list(range(x * 3, x * 3 + 3))
            mali.getEntry(id).maskColumns(columns)
    else:
        for id, s in mali.items():
            mali[id].mString = masker(s.mString)
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "reverse-complement", "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        apply=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method is None:
        raise ValueError("no method specified, please use --method")

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "reverse-complement":
        for record in Fastq.iterate(options.stdin):
            record.seq = Genomics.complement(record.seq)
            record.quals = record.quals[::-1]
            options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn(
                "consider sorting individual fastq files - "
                "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in zip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[
                    record1.identifier[:-2]] = (record1.seq, record1.quals)
                entries2[
                    record2.identifier[:-2]] = (record2.seq, record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0],
                                entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0],
                                entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-p", "--output-filename-pattern",
                      dest="output_filename_pattern", type="string",
                      help="OUTPUT filename with histogram information on "
                      "aggregate coverages [%default].")

    parser.add_option("--read-length-mean", dest="read_length_mean",
                      type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--read-length-std", dest="read_length_stddev",
                      type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-mean", dest="coverage_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--coverage-std", dest="coverage_stddev",
                      type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-mean", dest="ds_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--ds-std", dest="ds_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-mean", dest="error_mean", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--error-std", dest="error_stddev", type="float",
                      help="simulation parameter [default=%default].")

    parser.add_option("--min-read-length", dest="min_read_length",
                      type="int",
                      help="minimum read length [default=%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="randomly sample from selected transcripts "
                      "[default=%default].")

    parser.add_option("--test", dest="test", type="int",
                      help="test with # first entries [default=%default].")

    parser.add_option("--mode", dest="mode", type="choice",
                      choices=("genes", "transcripts"),
                      help="use genes or transcripts [default=%default].")

    parser.set_defaults(
        genome_file=None,
        read_length_mean=200.0,
        read_length_stddev=20.0,
        coverage_mean=2.0,
        coverage_stddev=1.0,
        ds_mean=None,
        ds_stddev=None,
        error_mean=None,
        error_stddev=None,
        min_read_length=50,
        test=None,
        mode="transcripts",
        output_filename_pattern=None,
        output_format_id="%010i",
        sample_size=0,
    )

    (options, args) = E.Start(parser, argv)

    assert options.genome_file, "please supply an indexed genome."

    if options.output_filename_pattern:
        outfile_stats = IOTools.openFile(
            options.output_filename_pattern % "stats", "w")
        outfile_stats.write(
            "id\tlen\tnreads\tlen_mean\tlen_std\tcov_mean\tcov_std\n")
        outfile_map = IOTools.openFile(
            options.output_filename_pattern % "map", "w")
        outfile_map.write("id\ttranscript\n")
    else:
        outfile_stats = None
        outfile_map = None

    genome = IndexedFasta.IndexedFasta(options.genome_file)

    ninput, noutput, nskipped = 0, 0, 0

    total_counts, total_read_lengths, total_len = [], [], 0
    total_pids = []
    total_error_pids = []

    if options.mode == "transcripts":
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"))
        getId = lambda x: x.transcript_id
    elif options.mode == "genes":
        iterator = GTF.flat_gene_iterator(
            GTF.iterator_filtered(GTF.iterator(options.stdin),
                                  feature="exon"))
        getId = lambda x: x.gene_id

    if options.sample_size:
        iterator = Iterators.sample(iterator)

    if options.ds_mean:
        do_mutate = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_mutate = False

    if options.error_mean:
        do_error = True
        pid_calc = SequencePairProperties.SequencePairPropertiesPID()
    else:
        do_error = False

    for gtfs in iterator:

        id = getId(gtfs[0])

        try:
            sequence = GTF.toSequence(gtfs, genome)
        except KeyError as msg:
            if options.loglevel >= 2:
                options.stdlog.write("# skipping %s: %s\n" % (id, msg))
            nskipped += 1
            continue

        lsequence = len(sequence)

        if lsequence <= options.min_read_length * 2:
            if options.loglevel >= 2:
                options.stdlog.write(
                    "# skipping %s - sequence is too short: %i\n" %
                    (id, lsequence))
            nskipped += 1
            continue

        ninput += 1

        if do_mutate:
            new_sequence = getMutatedSequence(sequence, options.ds_mean)
            pid_calc.loadPair(sequence, new_sequence)
            pid = pid_calc.mPID
            total_pids.append(pid)
            sequence = new_sequence
        else:
            pid = 100.0

        if options.loglevel >= 2:
            options.stdlog.write(
                "# processing %s - len=%i\n" % (id, lsequence))
            options.stdlog.flush()

        total_len += lsequence
        lvsequence = lsequence * \
            random.gauss(options.coverage_mean, options.coverage_stddev)

        covered = 0
        counts = numpy.zeros(lsequence)
        nreads = 0

        error_pids, read_lengths = [], []

        while covered < lvsequence:

            read_length = int(
                random.gauss(options.read_length_mean,
                             options.read_length_stddev))

            positive = random.randint(0, 1)

            if positive:
                start = random.randint(0, lsequence)
                end = min(lsequence, start + read_length)
            else:
                end = random.randint(0, lsequence)
                start = max(0, end - read_length)

            read_length = end - start
            if read_length < options.min_read_length:
                continue

            segment = sequence[start:end]
            if not positive:
                segment = Genomics.complement(segment)

            noutput += 1

            if do_error:
                new_segment = getMutatedSequence(segment,
                                                 options.error_mean)
                pid_calc.loadPair(segment, new_segment)
                pid = pid_calc.mPID
                error_pids.append(pid)
                segment = new_segment
            else:
                pid = 100.0

            options.stdout.write(
                ">%s\n%s\n" % (options.output_format_id % noutput, segment))

            if outfile_map:
                outfile_map.write(
                    "%s\t%s\n" % (id, options.output_format_id % noutput))

            for x in range(start, end):
                counts[x] += 1

            nreads += 1

            covered += read_length
            read_lengths.append(read_length)

        if options.loglevel >= 2:
            options.stdout.write(
                "# transcript %s: len=%i, nreads=%i, len_mean=%.2f, "
                "len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f\n" %
                (id, lsequence, nreads,
                 numpy.mean(read_lengths), numpy.std(read_lengths),
                 numpy.mean(counts), numpy.std(counts)))

        if outfile_stats:
            outfile_stats.write(
                "%s\t%i\t%i\t%.2f\t%.2f\t%.2f\t%.2f\n" %
                (id, lsequence, nreads,
                 numpy.mean(read_lengths), numpy.std(read_lengths),
                 numpy.mean(counts), numpy.std(counts)))

        total_counts += list(counts)
        total_read_lengths += read_lengths
        total_error_pids += error_pids

        if options.test and ninput >= options.test:
            break

        if options.sample_size and ninput >= options.sample_size:
            break

    if options.loglevel >= 1:
        output = ["len=%i, nreads=%i" % (total_len, noutput)]
        output.append(
            "len_mean=%.2f, len_std=%.2f, cov_mean=%.2f, cov_stddev=%.2f" %
            (numpy.mean(total_read_lengths),
             numpy.std(total_read_lengths),
             numpy.mean(total_counts),
             numpy.std(total_counts)))
        no_uncovered = [x for x in total_counts if x > 0]
        output.append(
            "cov0_mean=%.2f, cov0_stddev=%.2f" %
            (numpy.mean(no_uncovered), numpy.std(no_uncovered)))
        if do_mutate:
            output.append(
                "pid_mean=%.2f, pid_std=%.2f" %
                (numpy.mean(total_pids), numpy.std(total_pids)))
        if do_error:
            output.append(
                "pid_error_mean=%.2f, pid_error_std=%.2f" %
                (numpy.mean(total_error_pids),
                 numpy.std(total_error_pids)))

        options.stdlog.write("# effective: %s\n" % ", ".join(output))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-t", "--tablename", dest="tablename",
                      type="string",
                      help="tablename to get variants from (in samtools "
                      "pileup format) [default=%default].")

    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")

    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with transcript model information "
                      "(gtf formatted file) [default=%default].")

    parser.add_option("-r", "--filename-reference",
                      dest="filename_reference", type="string",
                      help="filename with transcript models of a reference "
                      "gene set. Stop codons that do not overlap any of "
                      "the exons in this file are ignored (gtf-formatted "
                      "file) [default=%default].")

    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should "
                      "be indexed by tabix [default=%default].")

    parser.add_option("--pileup-file", dest="filename_pileup",
                      type="string",
                      help="filename with variants in samtools pileup "
                      "format. Should be indexed by tabix "
                      "[default=%default].")

    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf "
                      "formatted file [default=%default].")

    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno",
                      type="string",
                      help="filename of a list of transcript ids that are "
                      "selenoproteins [default=%default].")

    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")

    parser.add_option("-o", "--output-section", dest="output",
                      type="choice", action="append",
                      choices=("all", "peptide", "cds", "table", "gtf",
                               "map"),
                      help="sections to output [default=%default].")

    parser.add_option("-k", "--with-knockouts", dest="with_knockouts",
                      action="store_true",
                      help="add alleles that are knocked out to fasta and "
                      "gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(options.database,
                                             options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(options.filename_vcf,
                                          options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"):
        #     contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# collected variants:", variants)

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(transcripts,
                                                            fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print("# merged variants:", variants)

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" %
                   len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants,
                                                  all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print("exon", key)
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print("intron", key)
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(
                transcript,
                variant_exons,
                variant_introns,
                all_exons,
                all_introns,
                offsets,
                is_seleno=transcript_id in seleno,
                reference_coordinates=False,
            )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(transcript,
                                                          all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been
                # knocked out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = lcontig - \
                                    gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = lcontig - \
                                gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(">%s\n%s\n" %
                                               (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 and \
                        not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,)

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id, transcript_id, allele_id,
                                   contig, strand, "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/cds2codons.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with mapping information.")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output file format [fasta-codons].")
    parser.add_option("-c", "--codons", dest="codons", action="store_true",
                      help="print codons separated by spaces.")

    parser.set_defaults(
        filename_cds=None,
        codons=False,
        format="fasta",
        filename_map=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_map:
        raise ValueError(
            "please supply filename with map between peptide to cds.")

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.Read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.filename_cds:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_cds, "r"))
    else:
        sequences = Genomics.ReadPeptideSequences(sys.stdin)

    if options.loglevel >= 1:
        print("# read %i sequences" % len(sequences))
        sys.stdout.flush()

    ninput, nskipped, noutput, nerrors, nstops = 0, 0, 0, 0, 0

    for key, s in sequences.items():

        ninput += 1

        if key not in map_old2new:
            nskipped += 1
            continue

        out_seq = []

        m = map_old2new[key]
        m.Expand()
        mm = m.mMapOld2New

        if mm.getColTo() > len(s):
            options.stderr.write(
                "# error for %s: sequence shorter than alignment: "
                "%i < %i\n" % (key, len(s), mm.getColTo()))
            nerrors += 1
            continue

        for x in range(mm.getRowFrom(), mm.getRowTo() + 1):
            y = mm.mapRowToCol(x)
            if y > 0:
                out_seq.append(s[y - 1])

        m.Clear()

        out_seq = "".join(out_seq)
        translation = Genomics.TranslateDNA2Protein(out_seq)

        if "X" in translation:
            nstops += 1

        if options.codons:
            out_seq = " ".join([out_seq[x:x + 3]
                                for x in range(0, len(out_seq), 3)])

        noutput += 1
        options.stdout.write(">%s\n%s\n" % (key, out_seq))

    options.stderr.write("# input=%i, output=%i, errors=%i, stops=%i\n" %
                         (ninput, noutput, nerrors, nstops))

    E.Stop()
def getSequence(self, contig, strand="+", start=0, end=0,
                converter=None, as_array=False):
    """get a genomic fragment.

    A genomic fragment is identified by the coordinates
    contig, strand, start, end.

    The converter function supplied translates these coordinates into
    0-based coordinates. By default, start and end are assumed to be
    pythonic coordinates and are forward/reverse coordinates.

    If as_array is set to true, return the AString object. This might
    be beneficial for large sequence chunks. If as_array is set to
    False, return a python string.
    """

    contig = self.getToken(contig)

    data = self.mIndex[contig]
    # dummy is
    # -> pos_seq for seekable streams
    # -> block_size for unseekable streams
    try:
        pos_id, dummy, lsequence = struct.unpack("QQi", data)
    except (struct.error, TypeError):
        pos_id, dummy, lsequence, points = data

    pos_seq = dummy
    block_size = dummy

    if end == 0:
        end = lsequence

    if end > lsequence:
        raise ValueError("3' coordinate on %s out of bounds: %i > %i" %
                         (contig, end, lsequence))
    if start < 0:
        raise ValueError("5' coordinate on %s out of bounds: %i < 0" %
                         (contig, start))

    if converter:
        first_pos, last_pos = converter(start, end,
                                        str(strand) in ("+", "1"),
                                        lsequence)
    elif self.mConverter:
        first_pos, last_pos = self.mConverter(start, end,
                                              str(strand) in ("+", "1"),
                                              lsequence)
    else:
        first_pos, last_pos = start, end
        if str(strand) in ("-", "0", "-1"):
            first_pos, last_pos = lsequence - \
                last_pos, lsequence - first_pos

    if first_pos == last_pos:
        return ""

    assert first_pos < last_pos, \
        "first position %i is larger than last position %i " % \
        (first_pos, last_pos)

    p = AString()

    if self.mNoSeek:
        # read directly from position
        p.fromstring(
            self.mDatabaseFile.read(block_size, data[3], first_pos,
                                    last_pos))
    else:
        first_pos += pos_seq
        last_pos += pos_seq

        self.mDatabaseFile.seek(first_pos)
        p.fromstring(self.mDatabaseFile.read(last_pos - first_pos))

    if str(strand) in ("-", "0", "-1"):
        p = AString(Genomics.reverse_complement(str(p)))

    if self.mTranslator:
        return self.mTranslator.translate(p)
    elif as_array:
        return p
    else:
        return p.tostring().decode("ascii")
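# Example converter (illustrative; the function name is hypothetical).
# The converter hook above receives (start, end, is_positive_strand,
# sequence_length) and must return 0-based forward-strand coordinates.
# This one maps 1-based closed coordinates to the expected pythonic slice.
def one_based_closed_converter(start, end, is_positive, lsequence):
    return start - 1, end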
def buildCompactVariantSequences(variants, sequences):
    '''build variant sequences by inserting ``variants`` into
    ``sequences``.

    The original frame of the sequence is maintained by converting the
    input sequence to a list. Each entry in the list corresponds to a
    position in the wild-type (WT) sequence. The WT sequence is lower
    case.

    SNP:                    variant (ambiguity codes for variants)
    homozygous insertion:   upper-case bases after the lower-case (WT) base
    heterozygous insertion: lower-case bases after the lower-case (WT) base
    homozygous deletion:    empty fields
    heterozygous deletion:  "-" after the lower-case (WT) base

    returns a dictionary of lists.
    '''
    result = {}
    for key, sequence in sequences.items():
        variant_seq = list(sequence.lower())
        start, end = key

        # get all variants that overlap with sequences
        for var_start, var_end, values in variants.find(start, end):
            reference, action, has_wildtype, variantseqs = values

            is_homozygous = len(variantseqs) == 1 and not has_wildtype

            rel_start, rel_end = var_start - start, var_end - start
            startoffset = max(0, start - var_start)
            endoffset = max(0, var_end - end)

            if action == "=":
                assert rel_start >= 0
                assert sequence[rel_start].upper() == reference, \
                    'reference base mismatch: expected %s, got %s at %i-%i' % \
                    (sequence[rel_start].upper(), reference,
                     var_start, var_end)

                if is_homozygous:
                    variant_seq[rel_start] = variantseqs[0]
                else:
                    variant_seq[rel_start] = \
                        Genomics.resolveReverseAmbiguousNA(
                            "".join(variantseqs))

            elif action == "-":
                xstart, xend = max(0, rel_start), min(len(sequence), rel_end)

                for variant in variantseqs:
                    # truncated for variants of unequal lengths (-AA/-AAA)
                    refseq = sequence[xstart:xend].upper()[:len(variant)]

                    assert refseq == variant[startoffset:len(variant) - endoffset], \
                        'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at %i-%i (%i-%i), action=%s' % \
                        (sequence[xstart - 10:xstart], refseq,
                         sequence[xend:xend + 10],
                         variant, startoffset, len(variant) - endoffset,
                         var_start, var_end, start, end, action)

                l = len(variant) - startoffset - endoffset
                if is_homozygous:
                    variant_seq[xstart:xend] = [""] * l
                else:
                    for x in range(xstart, xend):
                        if variant_seq[x].endswith("-"):
                            assert not has_wildtype
                            variant_seq[x] = ""
                        else:
                            variant_seq[x] += "-"

            elif action == "+":
                if is_homozygous:
                    variant_seq[rel_start] += variantseqs[0].upper()
                else:
                    if has_wildtype:
                        variant_seq[rel_start] += variantseqs[0].upper()
                    else:
                        # merge indels like +AAA/+AA
                        a, b = variantseqs
                        if a.startswith(b):
                            variant_seq[rel_start] += \
                                b.upper() + a[len(b):].lower()
                        elif b.startswith(a):
                            variant_seq[rel_start] += \
                                a.upper() + b[len(a):].lower()
                        else:
                            raise ValueError(
                                "don't know how to encode variant: %s" %
                                variantseqs)

        result[(start, end)] = variant_seq
    return result
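Once variants are applied, the per-position list can be collapsed back into a concrete sequence. Under the encoding described in the docstring, inserted bases ride along with the base they follow and homozygous deletions are empty fields, so a plain join suffices; a small sketch:

# Minimal sketch: collapse the per-position encoding back into a string.
# Homozygously deleted positions are '' and vanish; inserted bases were
# appended to the field of the base they follow.
def flatten_variant_seq(variant_seq):
    return "".join(variant_seq)

# WT "acgt" with a homozygous insertion of AA after position 1 and a
# homozygous deletion of position 2:
assert flatten_variant_seq(["a", "cAA", "", "t"]) == "acAAt"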
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):

    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            ["%i" % result.mNPositiveSites[x][0] for x in thresholds]))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            ["%i" % result.mNNegativeSites[x] for x in thresholds]))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered",
                            "positive-site-table", "negative-site-table",
                            "neutral-site-table",
                            "positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        column_data = [Mali.MaliData(x, gap_chars="Nn", mask_chars="-.")
                       for x in mali.getColumns()]

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." %
                (mali_width, len(result.mSites)))

        if options.method == "summary-filtered":

            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nneutral = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for x in range(len(result.mSites)):
                site = result.mSites[x]
                column = column_data[x * 3]

                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered,
                 ntotal, nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous,
                            nnegative, npositive)

        elif options.method in ("positive-site-table",
                                "negative-site-table",
                                "neutral-site-table",
                                "positive-site-list",
                                "negative-site-list",
                                "neutral-site-list"):

            select_positive_sites = options.method in (
                "positive-site-table", "positive-site-list")
            select_negative_sites = options.method in (
                "negative-site-table", "negative-site-list")

            # iterate over sites and output those under positive/negative
            # selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for x in range(mali_length)]

            sites = []

            for col in range(len(result.mSites)):
                site = result.mSites[col]
                column = column_data[col * 3]

                if column.mNChars != mali_length:
                    continue

                keep = False
                if select_positive_sites and site.isPositive(
                        options.significance_threshold,
                        options.use_adjusted):
                    keep = True
                elif select_negative_sites and site.isNegative(
                        options.significance_threshold,
                        options.use_adjusted):
                    keep = True

                if not keep:
                    continue

                sites.append((col, site))

            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                sites.sort(key=lambda x: x[1].mPValue)
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:
                site = result.mSites[col]
                xcol = col * 3

                for row in range(mali_length):
                    id = identifiers[row]
                    x = max(xcol - options.context_size * 3, 0)
                    y = min(xcol + 3 + options.context_size * 3, mali_width)
                    segment = mali[id][x:y]
                    codon = mali[id][xcol:xcol + 3]
                    pos = mali.getResidueNumber(id, xcol)
                    # save as real-world coordinates
                    pos //= 3

                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon),
                            pos + 1,
                            xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table",
                                  "negative-site-table",
                                  "neutral-site-table"):

                if options.context_size:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value,
                             ";".join(["%s%i in %s" %
                                       (x.mAA, x.mSequencePosition,
                                        x.mContext)
                                       for x in chars_per_row[row]])))
                else:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value,
                             ";".join(["%s%i" %
                                       (x.mAA, x.mSequencePosition)
                                       for x in chars_per_row[row]])))

            elif options.method in ("positive-site-list",
                                    "negative-site-list",
                                    "neutral-site-list"):

                for row in range(mali_length):
                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    x = 0
                    for chars in chars_per_row[row]:
                        x += 1
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, x, chars.mAA, chars.mSequencePosition,
                             chars.mMaliPosition, chars.mContext))

    options.stdout.flush()

    return counts
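The site tables above convert an alignment column into a 1-based protein coordinate by integer division of the residue number. A worked check of that arithmetic (illustrative only, not the Mali API):

# Illustrative arithmetic only: a residue number at a codon's first
# nucleotide maps to a 1-based amino-acid position by integer division.
def protein_position(residue_number):
    return residue_number // 3 + 1

assert protein_position(0) == 1   # first codon
assert protein_position(9) == 4   # fourth codon starts at nucleotide 9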
def updateProperties(self):

    SequencePropertiesCodons.updateProperties(self)

    self.mCodonFrequencies = Genomics.CalculateCodonFrequenciesFromCounts(
        self.mCodonCounts)
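CalculateCodonFrequenciesFromCounts is assumed here to normalise raw codon counts within each amino acid's codon family, the usual convention for codon-usage tables. A minimal sketch under that assumption:

# Minimal sketch, assuming frequencies are normalised within each amino
# acid's codon family (an assumption, not the library's documented API).
def codon_frequencies_from_counts(codon_counts, codon2aa):
    totals = {}
    for codon, n in codon_counts.items():
        aa = codon2aa[codon]
        totals[aa] = totals.get(aa, 0) + n
    return {codon: n / totals[codon2aa[codon]]
            for codon, n in codon_counts.items()
            if totals[codon2aa[codon]]}

counts = {"GCA": 3, "GCC": 1}   # both code for alanine
freqs = codon_frequencies_from_counts(counts, {"GCA": "A", "GCC": "A"})
assert freqs == {"GCA": 0.75, "GCC": 0.25}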
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants so that they use the same coordinate system
    (and strand) as the transcript.

    fixes 1-ness of variants
    '''
    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"

                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"

                # indels are after the base denoted by position
                # as region use both flanking bases so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper())
                           for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype,
            variantseqs)))

    return new_variants
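The closing strand flip relies on the half-open interval identity (start, end) -> (lcontig - end, lcontig - start), which the deletion example in the comments above illustrates. A two-assert check of that identity:

# Worked check of the coordinate flip used above: a half-open interval on
# the reverse strand of a contig of length lcontig maps to forward
# coordinates as (lcontig - end, lcontig - start).
def flip(start, end, lcontig):
    return lcontig - end, lcontig - start

assert flip(2, 4, 7) == (3, 5)            # matches the deletion example
assert flip(*flip(2, 4, 7), 7) == (2, 4)  # flipping twice is the identity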
def buildSequenceVariants(self, seq, strand, pos, snp):
    '''build new sequence by modifying a sequence fragment in seq at
    pos with snp.

    It is assumed that seq is already oriented according to strand.
    The strand is used to revert the snp if necessary.

    Note that only sequences different from seq will be returned.

    returns is_homozygous, seqs
    '''
    is_negative_strand = Genomics.IsNegativeStrand(strand)
    reference_base = snp.reference_base

    if reference_base != "*" and is_negative_strand:
        reference_base = Genomics.complement(reference_base)

    new_sequences = []
    is_homozygous = True
    if reference_base != "*":
        if seq[pos].upper() != reference_base.upper():
            raise ValueError("base mismatch at snp %i, expected %s, "
                             "got %s in %s at position %i; snp=%s" %
                             (snp.pos, reference_base, seq[pos],
                              seq, pos,
                              ";".join(map(str, snp))))

        # single base changes
        variant_bases = Genomics.resolveAmbiguousNA(snp.genotype)

        if len(variant_bases) == 1:
            is_homozygous = True
        else:
            is_homozygous = False

        for variant_base in variant_bases:
            if is_negative_strand:
                variant_base = Genomics.complement(variant_base)

            s = list(seq)
            s[pos] = variant_base
            s = "".join(s)
            if s != seq:
                new_sequences.append(s)
    else:
        variants = snp.genotype.split("/")
        is_homozygous = False
        for variant in variants:

            s = list(seq)
            # samtools denotes insert/deletion after position
            # while python is before/at position, hence the pos+1
            if variant[0] == "+":
                toinsert = variant[1:].upper()
                if is_negative_strand:
                    toinsert = Genomics.complement(toinsert)
                    s.insert(pos, toinsert)
                else:
                    s.insert(pos + 1, toinsert)

            elif variant[0] == "-":
                # pos+1+len(x)-1 = pos+len(x)
                todelete = variant[1:].upper()
                l = len(todelete)
                if is_negative_strand:
                    # delete left of pos
                    xstart = max(0, pos - l)
                    xend = pos
                    todelete = todelete[:min(l, pos)]
                else:
                    # delete right of pos
                    xstart = pos + 1
                    xend = min(self.mSize, pos + 1 + l)
                    todelete = todelete[:self.mSize - (pos + 1)]

                deleted = "".join(s[xstart:xend])

                if is_negative_strand:
                    deleted = Genomics.complement(deleted)

                if deleted != todelete:
                    raise ValueError(
                        "base mismatch at indel %i, expected %s, got %s "
                        "in %s at position %i(%i:%i); "
                        "is_negative_strand=%s, snp=%s" %
                        (snp.pos, todelete, deleted,
                         seq, pos, xstart, xend,
                         is_negative_strand,
                         ";".join(map(str, snp))))
                del s[xstart:xend]

            elif variant[0] == "*":
                is_homozygous = True
            else:
                raise ValueError("unknown variant sign '%s'" % variant[0])

            s = "".join(s)
            if s != seq:
                new_sequences.append(s)

    return is_homozygous, new_sequences
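The pos versus pos + 1 asymmetry above encodes samtools' convention that an insertion is reported after a base, while python's list.insert places before an index. A positive-strand sketch of that rule:

# Positive-strand sketch of the "insert after the reported base" rule:
# samtools reports the base *before* the insertion, list.insert places
# *before* an index, hence pos + 1.
s = list("ACGT")
pos = 1                  # insertion reported after the C at index 1
s.insert(pos + 1, "TT")  # a multi-char element joins back seamlessly
assert "".join(s) == "ACTTGT"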
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--merge-exons-distance",
                      dest="merge_exons_distance",
                      type="int",
                      help="distance in nucleotides between "
                      "exons to be merged [%default].")

    parser.add_option("--pattern-identifier", dest="pattern", type="string",
                      help="pattern to use for renaming genes/transcripts. "
                      "The pattern should contain a %i, for example "
                      "--pattern-identifier=ENSG%010i [%default].")

    parser.add_option("--sort-order",
                      dest="sort_order",
                      type="choice",
                      choices=("gene", "gene+transcript", "transcript",
                               "position", "contig+gene", "position+gene",
                               "gene+position"),
                      help="sort input data [%default].")

    parser.add_option("-u", "--with-utr",
                      dest="with_utr",
                      action="store_true",
                      help="include utr in merged transcripts "
                      "[%default].")

    parser.add_option("--filter-method", dest="filter_method",
                      type="choice",
                      choices=("gene", "transcript", "longest-gene",
                               "longest-transcript",
                               "representative-transcript",
                               "proteincoding", "lincrna"),
                      help="Filter method to apply. Available filters are: "
                      "'gene': filter by gene_id given in "
                      "``--map-tsv-file``, "
                      "'transcript': filter by transcript_id given in "
                      "``--map-tsv-file``, "
                      "'longest-gene': output the longest gene for "
                      "overlapping genes, "
                      "'longest-transcript': output the longest transcript "
                      "per gene, "
                      "'representative-transcript': output the "
                      "representative transcript per gene. The "
                      "representative transcript is the transcript that "
                      "shares most exons with other transcripts in a gene. "
                      "The input needs to be sorted by gene. "
                      "'proteincoding': only output protein coding features. "
                      "'lincrna': only output lincRNA features. "
                      "[%default].")

    parser.add_option("-a", "--map-tsv-file", dest="filename_filter",
                      type="string",
                      metavar="tsv",
                      help="filename of ids to map/filter [%default].")

    parser.add_option("--gff-file", dest="filename_gff", type="string",
                      metavar="GFF",
                      help="second filename of features "
                      "(see --remove-overlapping) [%default]")

    parser.add_option("--invert-filter",
                      dest="invert_filter",
                      action="store_true",
                      help="when using --filter, invert selection "
                      "(like grep -v). "
                      "[%default].")

    parser.add_option("--sample-size", dest="sample_size", type="int",
                      help="extract a random sample of size # if the option "
                      "'--method=filter --filter-method' is set "
                      "[%default].")

    parser.add_option("--intron-min-length",
                      dest="intron_min_length", type="int",
                      help="minimum length for introns "
                      "(for --exons-file2introns) [%default].")

    parser.add_option("--min-exons-length",
                      dest="min_exons_length",
                      type="int",
                      help="minimum length for gene (sum of exons) "
                      "(--sample-size) [%default].")

    parser.add_option("--intron-border",
                      dest="intron_border",
                      type="int",
                      help="number of residues to exclude at intron at "
                      "either end (--exons-file2introns) [%default].")

    parser.add_option("--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="remove strandedness of features (set to '.') "
                      "when using ``transcripts2genes`` or ``filter`` "
                      "[%default].")

    parser.add_option("--permit-duplicates", dest="strict",
                      action="store_false",
                      help="permit duplicate genes. "
                      "[%default]")

    parser.add_option("--duplicate-feature",
                      dest="duplicate_feature", type="choice",
                      choices=("gene", "transcript", "both", "ucsc",
                               "coordinates"),
                      help="remove duplicates by gene/transcript. "
                      "If ``ucsc`` is chosen, transcripts ending on _dup# "
                      "are removed. This is necessary to remove duplicate "
                      "entries that are next to each other in the sort "
                      "order [%default]")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      action="append",
                      choices=(
                          "add-protein-id",
                          "exons2introns",
                          "filter",
                          "find-retained-introns",
                          "genes-to-unique-chunks",
                          "intersect-transcripts",
                          "join-exons",
                          "merge-exons",
                          "merge-transcripts",
                          "merge-genes",
                          "merge-introns",
                          "remove-overlapping",
                          "remove-duplicates",
                          "rename-genes",
                          "rename-transcripts",
                          "rename-duplicates",
                          "renumber-genes",
                          "renumber-transcripts",
                          "set-transcript-to-gene",
                          "set-gene-to-transcript",
                          "set-protein-to-transcript",
                          "set-score-to-distance",
                          "set-gene_biotype-to-source",
                          "sort",
                          "transcripts2genes",
                          "unset-genes"),
                      help="Method to apply [%default]. "
                      "Please only select one.")

    parser.set_defaults(
        sort_order="gene",
        filter_method="gene",
        pattern="%i",
        merge_exons_distance=0,
        filename_filter=None,
        intron_border=None,
        intron_min_length=None,
        sample_size=0,
        min_exons_length=0,
        ignore_strand=False,
        with_utr=False,
        invert_filter=False,
        duplicate_feature=None,
        strict=True,
        method=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0

    if options.method is None:
        raise ValueError("please specify a --method")
    if len(options.method) > 1:
        raise ValueError("multiple --method arguments specified")
    else:
        options.method = options.method[0]

    if options.method == "set-transcript-to-gene":

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("transcript_id", gff.gene_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.method == "set-gene_biotype-to-source":

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if "gene_biotype" not in gff:
                gff.setAttribute("gene_biotype", gff.source)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif options.method == "remove-duplicates":

        counts = collections.defaultdict(int)

        if options.duplicate_feature == "ucsc":
            store = []
            remove = set()
            f = lambda x: x[0].transcript_id

            gffs = GTF.transcript_iterator(
                GTF.iterator(options.stdin), strict=False)
            outf = lambda x: "\n".join([str(y) for y in x])

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                if "_dup" in id:
                    remove.add(re.sub(r"_dup\d+", "", id))
                    remove.add(id)

            for entry in store:
                id = f(entry)
                if id not in remove:
                    options.stdout.write(outf(entry) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1
                    E.info("discarded duplicates for %s" % (id))
        else:

            if options.duplicate_feature == "gene":
                gffs = GTF.gene_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0][0].gene_id
                outf = lambda x: "\n".join(
                    ["\n".join([str(y) for y in xx]) for xx in x])

            elif options.duplicate_feature == "transcript":
                gffs = GTF.transcript_iterator(
                    GTF.iterator(options.stdin), strict=False)
                f = lambda x: x[0].transcript_id
                outf = lambda x: "\n".join([str(y) for y in x])

            elif options.duplicate_feature == "coordinates":
                gffs = GTF.chunk_iterator(GTF.iterator(options.stdin))
                f = lambda x: x[0].contig + "_" + \
                    str(x[0].start) + "-" + str(x[0].end)
                outf = lambda x: "\n".join([str(y) for y in x])

            store = []

            for entry in gffs:
                ninput += 1
                store.append(entry)
                id = f(entry)
                counts[id] += 1

            # Assumes GTF file sorted by contig then start
            last_id = ""
            if options.duplicate_feature == "coordinates":
                for entry in store:
                    id = f(entry)
                    if id == last_id:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))
                    else:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    last_id = id
            else:
                for entry in store:
                    id = f(entry)
                    if counts[id] == 1:
                        options.stdout.write(outf(entry) + "\n")
                        noutput += 1
                    else:
                        ndiscarded += 1
                        E.info("discarded duplicates for %s: %i" %
                               (id, counts[id]))

    elif "sort" == options.method:

        for gff in GTF.iterator_sorted(GTF.iterator(options.stdin),
                                       sort_order=options.sort_order):
            ninput += 1
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-gene-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("gene_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "set-protein-to-transcript" == options.method:

        for gff in GTF.iterator(options.stdin):
            ninput += 1
            gff.setAttribute("protein_id", gff.transcript_id)
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

    elif "add-protein-id" == options.method:

        transcript2protein = IOTools.readMap(
            IOTools.openFile(options.filename_filter, "r"))

        missing = set()
        for gff in GTF.iterator(options.stdin):
            ninput += 1
            if gff.transcript_id not in transcript2protein:
                if gff.transcript_id not in missing:
                    E.debug(("removing transcript '%s' due to "
                             "missing protein id") % gff.transcript_id)
                    missing.add(gff.transcript_id)
                ndiscarded += 1
                continue

            gff.setAttribute("protein_id",
                             transcript2protein[gff.transcript_id])
            options.stdout.write("%s\n" % str(gff))
            noutput += 1
            nfeatures += 1

        E.info("transcripts removed due to missing protein ids: %i" %
               len(missing))

    elif "join-exons" == options.method:

        for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(exons[0].strand)
            contig = exons[0].contig
            transid = exons[0].transcript_id
            geneid = exons[0].gene_id
            biotype = exons[0].source
            all_start, all_end = (min([x.start for x in exons]),
                                  max([x.end for x in exons]))
            y = GTF.Entry()
            y.contig = contig
            y.source = biotype
            y.feature = "transcript"
            y.start = all_start
            y.end = all_end
            y.strand = strand
            y.transcript_id = transid
            y.gene_id = geneid
            options.stdout.write("%s\n" % str(y))

    elif "merge-genes" == options.method:
        # merges overlapping genes
        #
        gffs = GTF.iterator_sorted_chunks(
            GTF.flat_gene_iterator(GTF.iterator(options.stdin)),
            sort_by="contig-strand-start")

        def iterate_chunks(gff_chunks):

            last = next(gff_chunks)
            to_join = [last]

            for gffs in gff_chunks:
                d = gffs[0].start - last[-1].end

                if gffs[0].contig == last[0].contig and \
                        gffs[0].strand == last[0].strand:
                    assert gffs[0].start >= last[0].start, \
                        ("input file should be sorted by contig, strand "
                         "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \
                        (d,
                         "\n".join([str(x) for x in last]),
                         "\n".join([str(x) for x in gffs]))

                if gffs[0].contig != last[0].contig or \
                        gffs[0].strand != last[0].strand or \
                        d > 0:
                    yield to_join
                    to_join = []

                last = gffs
                to_join.append(gffs)

            yield to_join

        for chunks in iterate_chunks(gffs):
            ninput += 1
            if len(chunks) > 1:
                gene_id = "merged_%s" % chunks[0][0].gene_id
                transcript_id = "merged_%s" % chunks[0][0].transcript_id
                info = ",".join([x[0].gene_id for x in chunks])
            else:
                gene_id = chunks[0][0].gene_id
                transcript_id = chunks[0][0].transcript_id
                info = None

            intervals = []
            for c in chunks:
                intervals += [(x.start, x.end) for x in c]

            intervals = Intervals.combine(intervals)
            # take single strand
            strand = chunks[0][0].strand

            for start, end in intervals:
                y = GTF.Entry()
                y.fromGTF(chunks[0][0], gene_id, transcript_id)
                y.start = start
                y.end = end
                y.strand = strand
                if info:
                    y.addAttribute("merged", info)
                options.stdout.write("%s\n" % str(y))
                nfeatures += 1

            noutput += 1

    elif options.method == "renumber-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            if gtf.gene_id not in map_old2new:
                map_old2new[gtf.gene_id] = options.pattern % (
                    len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[gtf.gene_id])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "unset-genes":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = gtf.transcript_id
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("gene_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "renumber-transcripts":

        map_old2new = {}
        for gtf in GTF.iterator(options.stdin):
            ninput += 1
            key = (gtf.gene_id, gtf.transcript_id)
            if key not in map_old2new:
                map_old2new[key] = options.pattern % (len(map_old2new) + 1)
            gtf.setAttribute("transcript_id", map_old2new[key])
            options.stdout.write("%s\n" % str(gtf))
            noutput += 1

    elif options.method == "transcripts2genes":

        transcripts = set()
        genes = set()
        ignore_strand = options.ignore_strand
        for gtfs in GTF.iterator_transcripts2genes(
                GTF.iterator(options.stdin)):
            ninput += 1
            for gtf in gtfs:
                if ignore_strand:
                    gtf.strand = "."
                options.stdout.write("%s\n" % str(gtf))
                transcripts.add(gtf.transcript_id)
                genes.add(gtf.gene_id)
                nfeatures += 1
            noutput += 1

        E.info("transcripts2genes: transcripts=%i, genes=%i" %
               (len(transcripts), len(genes)))

    elif options.method in ("rename-genes", "rename-transcripts"):

        map_old2new = IOTools.readMap(
            IOTools.openFile(options.filename_filter, "r"))

        if options.method == "rename-transcripts":
            is_gene_id = False
        elif options.method == "rename-genes":
            is_gene_id = True

        for gff in GTF.iterator(options.stdin):
            ninput += 1

            if is_gene_id:
                if gff.gene_id in map_old2new:
                    gff.setAttribute("gene_id", map_old2new[gff.gene_id])
                else:
                    E.debug("removing missing gene_id %s" % gff.gene_id)
                    ndiscarded += 1
                    continue
            else:
                if gff.transcript_id in map_old2new:
                    gff.setAttribute("transcript_id",
                                     map_old2new[gff.transcript_id])
                else:
                    E.debug("removing missing transcript_id %s" %
                            gff.transcript_id)
                    ndiscarded += 1
                    continue

            noutput += 1
            options.stdout.write("%s\n" % str(gff))

    elif options.method == "filter":

        keep_genes = set()
        if options.filter_method == "longest-gene":
            iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin))
            coords = []
            gffs = []
            for gff in iterator:
                gff.sort(key=lambda x: x.start)
                coords.append((gff[0].contig,
                               min([x.start for x in gff]),
                               max([x.end for x in gff]),
                               gff[0].gene_id))
                gffs.append(gff)
            coords.sort()

            last_contig = None
            max_end = 0
            longest_gene_id = None
            longest_length = None

            for contig, start, end, gene_id in coords:
                ninput += 1
                if contig != last_contig or start >= max_end:
                    if longest_gene_id:
                        keep_genes.add(longest_gene_id)
                    longest_gene_id = gene_id
                    longest_length = end - start
                    max_end = end
                else:
                    if end - start > longest_length:
                        longest_length, longest_gene_id = \
                            end - start, gene_id
                last_contig = contig
                max_end = max(max_end, end)

            keep_genes.add(longest_gene_id)

            invert = options.invert_filter
            for gff in gffs:
                keep = gff[0].gene_id in keep_genes

                if (keep and not invert) or (not keep and invert):
                    noutput += 1
                    for g in gff:
                        nfeatures += 1
                        options.stdout.write("%s\n" % g)
                else:
                    ndiscarded += 1

        elif options.filter_method in ("longest-transcript",
                                       "representative-transcript"):

            iterator = GTF.gene_iterator(GTF.iterator(options.stdin))

            def selectLongestTranscript(gene):
                r = []
                for transcript in gene:
                    transcript.sort(key=lambda x: x.start)
                    length = transcript[-1].end - transcript[0].start
                    r.append((length, transcript))
                r.sort()
                return r[-1][1]

            def selectRepresentativeTranscript(gene):
                """select a representative transcript.

                The representative transcript represents the largest number
                of exons over all transcripts.
                """
                all_exons = []
                for transcript in gene:
                    all_exons.extend([(x.start, x.end)
                                      for x in transcript
                                      if x.feature == "exon"])
                # groupby only groups consecutive duplicates, so sort first
                all_exons.sort()
                exon_counts = {}
                for key, exons in itertools.groupby(all_exons):
                    exon_counts[key] = len(list(exons))
                transcript_counts = []
                for transcript in gene:
                    count = sum([exon_counts[(x.start, x.end)]
                                 for x in transcript
                                 if x.feature == "exon"])
                    # add transcript id to sort to provide a stable
                    # segmentation.
                    transcript_counts.append((count,
                                              transcript[0].transcript_id,
                                              transcript))
                transcript_counts.sort()
                return transcript_counts[-1][-1]

            if options.filter_method == "longest-transcript":
                _select = selectLongestTranscript
            elif options.filter_method == "representative-transcript":
                _select = selectRepresentativeTranscript

            for gene in iterator:
                ninput += 1
                # sort in order to make reproducible which
                # gene is chosen.
                transcript = _select(sorted(gene))
                noutput += 1
                for g in transcript:
                    nfeatures += 1
                    options.stdout.write("%s\n" % g)

        elif options.filter_method in ("gene", "transcript"):

            if options.filename_filter:

                ids, nerrors = IOTools.ReadList(
                    IOTools.openFile(options.filename_filter, "r"))
                E.info("read %i ids" % len(ids))

                ids = set(ids)
                by_gene = options.filter_method == "gene"
                by_transcript = options.filter_method == "transcript"
                invert = options.invert_filter

                ignore_strand = options.ignore_strand
                for gff in GTF.iterator(options.stdin):

                    ninput += 1

                    keep = False
                    if by_gene:
                        keep = gff.gene_id in ids
                    if by_transcript:
                        keep = gff.transcript_id in ids
                    if (invert and keep) or (not invert and not keep):
                        continue

                    if ignore_strand:
                        gff.strand = "."

                    options.stdout.write("%s\n" % str(gff))
                    nfeatures += 1
                    noutput += 1

            elif options.sample_size:

                if options.filter_method == "gene":
                    iterator = GTF.flat_gene_iterator(
                        GTF.iterator(options.stdin))
                elif options.filter_method == "transcript":
                    iterator = GTF.transcript_iterator(
                        GTF.iterator(options.stdin))
                if options.min_exons_length:
                    iterator = GTF.iterator_min_feature_length(
                        iterator,
                        min_length=options.min_exons_length,
                        feature="exon")

                data = [x for x in iterator]
                ninput = len(data)
                if len(data) > options.sample_size:
                    data = random.sample(data, options.sample_size)

                for d in data:
                    noutput += 1
                    for dd in d:
                        nfeatures += 1
                        options.stdout.write(str(dd) + "\n")

            else:
                assert False, ("please supply either a filename "
                               "with ids to filter with (--map-tsv-file) "
                               "or a sample-size.")

        elif options.filter_method in ("proteincoding", "lincrna",
                                       "processed-pseudogene"):

            # extract entries by transcript/gene biotype.
            # This filter uses a test on the source field (ENSEMBL pre v78)
            # or a regular expression on the attributes (ENSEMBL >= v78).
            tag = {
                "proteincoding": "protein_coding",
                "processed-pseudogene": "processed_pseudogene",
                "lincrna": "lincRNA",
            }[options.filter_method]

            rx = re.compile('"%s"' % tag)
            if not options.invert_filter:
                f = lambda x: x.source == tag or rx.search(x.attributes)
            else:
                f = lambda x: x.source != tag and \
                    not rx.search(x.attributes)

            for gff in GTF.iterator(options.stdin):
                ninput += 1
                if f(gff):
                    options.stdout.write(str(gff) + "\n")
                    noutput += 1
                else:
                    ndiscarded += 1

    elif options.method == "exons2introns":

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")
            input_ranges = Intervals.combine(cds_ranges + exon_ranges)

            if len(input_ranges) > 1:
                last = input_ranges[0][1]
                output_ranges = []
                for start, end in input_ranges[1:]:
                    output_ranges.append((last, start))
                    last = end

                if options.intron_border:
                    b = options.intron_border
                    output_ranges = [(x[0] + b, x[1] - b)
                                     for x in output_ranges]

                if options.intron_min_length:
                    l = options.intron_min_length
                    output_ranges = [x for x in output_ranges
                                     if x[1] - x[0] > l]

                for start, end in output_ranges:

                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    entry.feature = "intron"
                    entry.start = start
                    entry.end = end
                    options.stdout.write("%s\n" % str(entry))
                    nfeatures += 1
                noutput += 1
            else:
                ndiscarded += 1

    elif options.method == "set-score-to-distance":

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            strand = Genomics.convertStrand(gffs[0].strand)
            all_start, all_end = (min([x.start for x in gffs]),
                                  max([x.end for x in gffs]))

            if strand != ".":
                t = 0
                if strand == "-":
                    gffs.reverse()
                for gff in gffs:
                    gff.score = t
                    t += gff.end - gff.start

                if strand == "-":
                    gffs.reverse()
            for gff in gffs:
                options.stdout.write("%s\n" % str(gff))
                nfeatures += 1
            noutput += 1

    elif options.method == "remove-overlapping":

        index = GTF.readAndIndex(
            GTF.iterator(IOTools.openFile(options.filename_gff, "r")))

        for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found = False
            for e in gffs:
                if index.contains(e.contig, e.start, e.end):
                    found = True
                    break

            if found:
                ndiscarded += 1
            else:
                noutput += 1
                for e in gffs:
                    nfeatures += 1
                    options.stdout.write("%s\n" % str(e))

    elif options.method == "intersect-transcripts":

        for gffs in GTF.gene_iterator(GTF.iterator(options.stdin),
                                      strict=options.strict):

            ninput += 1
            r = []
            for g in gffs:
                if options.with_utr:
                    ranges = GTF.asRanges(g, "exon")
                else:
                    ranges = GTF.asRanges(g, "CDS")
                r.append(ranges)

            result = r[0]
            for x in r[1:]:
                result = Intervals.intersect(result, x)

            entry = GTF.Entry()
            entry.copy(gffs[0][0])
            entry.clearAttributes()
            entry.transcript_id = "merged"
            entry.feature = "exon"

            for start, end in result:
                entry.start = start
                entry.end = end
                options.stdout.write("%s\n" % str(entry))
                nfeatures += 1

            noutput += 1

    elif "rename-duplicates" == options.method:
        # note: this will only rename entries with "CDS" in feature column

        assert options.duplicate_feature in ["gene", "transcript", "both"], (
            "for renaming duplicates, --duplicate-feature must be set to "
            "one of 'gene', 'transcript' or 'both'")

        gene_ids = list()
        transcript_ids = list()
        gtfs = list()

        for gtf in GTF.iterator(options.stdin):
            gtfs.append(gtf)
            if gtf.feature == "CDS":
                gene_ids.append(gtf.gene_id)
                transcript_ids.append(gtf.transcript_id)

        dup_gene = [item for item in set(gene_ids)
                    if gene_ids.count(item) > 1]
        dup_transcript = [item for item in set(transcript_ids)
                          if transcript_ids.count(item) > 1]

        E.info("Number of duplicated gene_ids: %i" % len(dup_gene))
        E.info("Number of duplicated transcript_ids: %i" %
               len(dup_transcript))

        gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene))))
        transcript_dict = dict(zip(dup_transcript,
                                   ([0] * len(dup_transcript))))

        for gtf in gtfs:
            if gtf.feature == "CDS":
                if options.duplicate_feature in ["both", "gene"]:
                    if gtf.gene_id in dup_gene:
                        gene_dict[gtf.gene_id] += 1
                        gtf.setAttribute(
                            "gene_id",
                            gtf.gene_id + "." +
                            str(gene_dict[gtf.gene_id]))

                if options.duplicate_feature in ["both", "transcript"]:
                    if gtf.transcript_id in dup_transcript:
                        transcript_dict[gtf.transcript_id] += 1
                        gtf.setAttribute(
                            "transcript_id",
                            gtf.transcript_id + "." +
                            str(transcript_dict[gtf.transcript_id]))

            options.stdout.write("%s\n" % gtf)

    elif options.method in ("merge-exons",
                            "merge-introns",
                            "merge-transcripts"):

        for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin),
                                           strict=options.strict):

            ninput += 1

            cds_ranges = GTF.asRanges(gffs, "CDS")
            exon_ranges = GTF.asRanges(gffs, "exon")

            # sanity checks
            strands = set([x.strand for x in gffs])
            contigs = set([x.contig for x in gffs])
            if len(strands) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple strands: %s" %
                    (gffs[0].gene_id, str(strands)))

            if len(contigs) > 1:
                raise ValueError(
                    "can not merge gene '%s' on multiple contigs: %s" %
                    (gffs[0].gene_id, str(contigs)))

            strand = Genomics.convertStrand(gffs[0].strand)

            if cds_ranges and options.with_utr:
                cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1]
                midpoint = (cds_end - cds_start) // 2 + cds_start

                utr_ranges = []
                for start, end in Intervals.truncate(exon_ranges,
                                                     cds_ranges):
                    if end - start > 3:
                        if strand == ".":
                            feature = "UTR"
                        elif strand == "+":
                            if start < midpoint:
                                feature = "UTR5"
                            else:
                                feature = "UTR3"
                        elif strand == "-":
                            if start < midpoint:
                                feature = "UTR3"
                            else:
                                feature = "UTR5"
                        utr_ranges.append((feature, start, end))
                output_feature = "CDS"
                output_ranges = cds_ranges
            else:
                output_feature = "exon"
                output_ranges = exon_ranges
                utr_ranges = []

            result = []

            try:
                biotypes = [x["gene_biotype"] for x in gffs]
                biotype = ":".join(set(biotypes))
            except (KeyError, AttributeError):
                biotype = None

            if options.method == "merge-exons":
                # need to combine per feature - skip
                # utr_ranges = Intervals.combineAtDistance(
                #     utr_ranges,
                #     options.merge_exons_distance)

                output_ranges = Intervals.combineAtDistance(
                    output_ranges, options.merge_exons_distance)

                for feature, start, end in utr_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.feature = feature
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = start
                    entry.end = end
                    result.append(entry)

                for start, end in output_ranges:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = "merged"
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.feature = output_feature
                    entry.start = start
                    entry.end = end
                    result.append(entry)

            elif options.method == "merge-transcripts":

                entry = GTF.Entry()
                entry.copy(gffs[0])
                entry.clearAttributes()
                entry.transcript_id = entry.gene_id
                if biotype:
                    entry.addAttribute("gene_biotype", biotype)
                entry.start = output_ranges[0][0]
                entry.end = output_ranges[-1][1]
                result.append(entry)

            elif options.method == "merge-introns":

                if len(output_ranges) >= 2:
                    entry = GTF.Entry()
                    entry.copy(gffs[0])
                    entry.clearAttributes()
                    entry.transcript_id = entry.gene_id
                    if biotype:
                        entry.addAttribute("gene_biotype", biotype)
                    entry.start = output_ranges[0][1]
                    entry.end = output_ranges[-1][0]
                    result.append(entry)
                else:
                    ndiscarded += 1
                    continue

            result.sort(key=lambda x: x.start)

            for x in result:
                options.stdout.write("%s\n" % str(x))
                nfeatures += 1
            noutput += 1

    elif options.method == "find-retained-introns":

        for gene in GTF.gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            found_any = False
            for intron in find_retained_introns(gene):
                found_any = True
                options.stdout.write("%s\n" % str(intron))
                nfeatures += 1
            if found_any:
                noutput += 1

    elif options.method == "genes-to-unique-chunks":

        for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):
            ninput += 1
            for exon in gene_to_blocks(gene):
                options.stdout.write("%s\n" % str(exon))
                nfeatures += 1
            noutput += 1

    else:
        raise ValueError("unknown method '%s'" % options.method)

    E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" %
           (ninput, noutput, nfeatures, ndiscarded))
    E.Stop()
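Several of the merge methods above reduce to fusing intervals that lie within a given distance of each other. A minimal sketch of the semantics assumed for Intervals.combineAtDistance (half-open intervals, unsorted input allowed):

# Minimal sketch of interval merging at a distance, as assumed for
# Intervals.combineAtDistance: adjacent intervals whose gap is at most
# `distance` are fused into one.
def combine_at_distance(intervals, distance=0):
    merged = []
    for start, end in sorted(intervals):
        if merged and start - merged[-1][1] <= distance:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

assert combine_at_distance([(0, 5), (7, 9)], distance=2) == [(0, 9)]
assert combine_at_distance([(0, 5), (7, 9)], distance=1) == [(0, 5), (7, 9)]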
def _alignToProfile(infile, outfile, min_score=0):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile*
    and a table in :file:`outfile.log`.
    '''

    mali = Mali.Mali()
    mali.readFromFile(open("../data/mouse.fasta"))
    src_mali = Mali.convertMali2Alignlib(mali)

    E.debug("read mali: %i sequences x %i columns" %
            (mali.getNumSequences(), mali.getNumColumns()))

    # add pseudocounts
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0, 2):
            profile_mali.addSequence("%s%i" % (x, y), 0, n, x * n)

    profile_mali = Mali.convertMali2Alignlib(profile_mali)
    alignlib.setDefaultEncoder(alignlib.getEncoder(alignlib.DNA4))
    alignlib.setDefaultLogOddor(alignlib.makeLogOddorUniform())

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile(profile_mali)

    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull(alignment_mode, -5.0, -0.5)

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal(0, n, 0)
    build_mali.add(src_mali, m)

    outf = open(outfile, "w")
    outf_log = open(outfile + ".info", "w")
    outf_log.write(
        "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\t"
        "score\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n")

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    for pid in mali.getIdentifiers():
        sequences.append(re.sub("-", "", mali[pid]))
        ids.append(pid)

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator(open(infile)):

        E.debug("adding %s" % s.title)
        c.input += 1
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence(s.sequence)
        rseq = alignlib.makeSequence(rsequence)

        alignator.align(map_seq2profile, seq, profile)
        alignator.align(map_rseq2profile, rseq, profile)

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score:
            c.skipped += 1
            continue

        r = getParts(m)

        covered = 0
        for mm in r:
            build_mali.add(mm)
            sequences.append(sequence)
            ids.append(s.title)
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write("\t".join(map(str, (
            s.title,
            len(s.sequence),
            m.getRowFrom(),
            m.getRowTo(),
            len(r),
            covered,
            "%5.2f" % (100.0 * covered / len(s.sequence)),
            m.getScore(),
            m.getColFrom(),
            m.getColTo(),
            mali_covered,
            "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns())
        ))) + "\n")

        c.output += 1

    # build_mali.expand( aa )
    result = str(alignlib.MultAlignmentFormatPlain(
        build_mali, sequences, alignlib.UnalignedStacked))

    for pid, data in zip(ids, result.split("\n")):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" %
                   (pid, int(start) + 1, int(end), sequence))

    outf.close()
    outf_log.close()

    E.info("%s\n" % str(c))
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option( "-c", "--codons", dest="codons", action="store_true", help="make sure that shuffled sequences only contain valid codons.") parser.add_option("-a", "--conserve-aminos", dest="conserve_aminos", action="store_true", help="conserve amino acids.") parser.add_option( "-b", "--bias", dest="bias", type="float", help= "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0." ) parser.add_option( "-i", "--biased-codon-usage", dest="filename_biased_codon_usage", type="string", help="Filename with reference codon usage table for biased codon usage." ) parser.add_option( "-u", "--bulk-codon-usage", dest="filename_bulk_codon_usage", type="string", help= "Filename with reference codon usage table for unbiased codon usage.") parser.set_defaults( codons=False, conserve_aminos=False, bias=0.0, filename_biased_codon_usage=None, filename_bulk_codon_usage=None, stop_codons=("TAG", "TAA", "TGA"), precision=10000, ) (options, args) = E.Start(parser, add_pipe_options=True) iterator = FastaIterator.FastaIterator(sys.stdin) # get map of amino acids to codons map_aa2codons = Genomics.GetMapAA2Codons() # for codon based shuffling: build ranges based on strength of bias and on reference codon usage # Bias switches from completely biased to unbiased. Unbiased is uniform # usage. if options.filename_biased_codon_usage: map_codon2frequency = IOTools.ReadMap(open( options.filename_biased_codon_usage, "r"), map_functions=(str, float), has_header=True) if options.filename_bulk_codon_usage: map_codon2frequency_bulk = IOTools.ReadMap( open(options.filename_bulk_codon_usage, "r"), map_functions=(str, float), has_header=True) codon_ranges = {} for aa in map_aa2codons.keys(): c = [] x = 0 for codon in map_aa2codons[aa]: if options.filename_bulk_codon_usage: u = map_codon2frequency_bulk[codon] else: # uniform usage u = 1.0 / len(map_aa2codons[aa]) g = map_codon2frequency[codon] f = g + (u - g) * (1.0 - options.bias) x += f * options.precision c.append(x) codon_ranges[aa] = c while 1: cur_record = iterator.next() if cur_record is None: break sequence = re.sub(" ", "", cur_record.sequence) l = len(sequence) if options.conserve_aminos: n = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: aa = Genomics.MapCodon2AA(codon) if aa not in map_aa2codons: continue if options.bias or options.filename_biased_codon_usage: # get random number from 0 to precision v = random.randint(0, options.precision) # find the corresponding intervall: l = len(map_aa2codons[aa]) x = 0 while x < l - 1: if v < codon_ranges[aa][x]: break x += 1 else: x = random.randint(0, len(map_aa2codons[aa]) - 1) n.append(map_aa2codons[aa][x]) sequence = "".join(n) else: sequence = list(sequence) if options.codons: while 1: random.shuffle(sequence) for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: if codon in options.stop_codons: redo = True break else: break else: random.shuffle(sequence) sequence = "".join(sequence) options.stdout.write(">%s\n%s\n" % (cur_record.title, "".join(sequence))) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--is-gtf", dest="is_gtf", action="store_true", help="input is gtf instead of gff.") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome [default=%default].") parser.add_option("-m", "--merge-adjacent", dest="merge", action="store_true", help="merge adjacent intervals with the same attributes." " [default=%default]") parser.add_option("-e", "--feature", dest="feature", type="string", help="filter by a feature, for example 'exon', 'CDS'." " If set to the empty string, all entries are output " "[%default].") parser.add_option("-f", "--maskregions-bed-file", dest="filename_masks", type="string", metavar="gff", help="mask sequences with regions given in gff file " "[%default].") parser.add_option("--remove-masked-regions", dest="remove_masked_regions", action="store_true", help="remove regions instead of masking [%default].") parser.add_option("--min-interval-length", dest="min_length", type="int", help="set minimum length for sequences output " "[%default]") parser.add_option("--max-length", dest="max_length", type="int", help="set maximum length for sequences output " "[%default]") parser.add_option("--extend-at", dest="extend_at", type="choice", choices=("none", "3", "5", "both", "3only", "5only"), help="extend at no end, 3', 5' or both ends. If " "3only or 5only are set, only the added sequence " "is returned [default=%default]") parser.add_option("--extend-by", dest="extend_by", type="int", help="extend by # bases [default=%default]") parser.add_option("--extend-with", dest="extend_with", type="string", help="extend using base [default=%default]") parser.add_option("--masker", dest="masker", type="choice", choices=("dust", "dustmasker", "softmask", "none"), help="apply masker [%default].") parser.add_option("--fold-at", dest="fold_at", type="int", help="fold sequence every n bases[%default].") parser.add_option( "--fasta-name-attribute", dest="naming_attribute", type="string", help="use attribute to name fasta entry. Currently only compatable" " with gff format [%default].") parser.set_defaults(is_gtf=False, genome_file=None, merge=False, feature=None, filename_masks=None, remove_masked_regions=False, min_length=0, max_length=0, extend_at=None, extend_by=100, extend_with=None, masker=None, fold_at=None, naming_attribute=False) (options, args) = E.Start(parser) if options.genome_file: fasta = IndexedFasta.IndexedFasta(options.genome_file) contigs = fasta.getContigSizes() if options.is_gtf: iterator = GTF.transcript_iterator(GTF.iterator(options.stdin)) else: gffs = GTF.iterator(options.stdin) if options.merge: iterator = GTF.joined_iterator(gffs) else: iterator = GTF.chunk_iterator(gffs) masks = None if options.filename_masks: masks = {} with IOTools.openFile(options.filename_masks, "r") as infile: e = GTF.readAsIntervals(GTF.iterator(infile)) # convert intervals to intersectors for contig in list(e.keys()): intersector = bx.intervals.intersection.Intersecter() for start, end in e[contig]: intersector.add_interval(bx.intervals.Interval(start, end)) masks[contig] = intersector ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0 nskipped_length = 0 nskipped_noexons = 0 feature = options.feature # iterator is a list containing groups (lists) of features. 
# Each group of features have in common the same transcript ID, in case of # GTF files. for ichunk in iterator: ninput += 1 if feature: chunk = [x for x in ichunk if x.feature == feature] else: chunk = ichunk if len(chunk) == 0: nskipped_noexons += 1 E.info("no features in entry from " "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start, ichunk[0].end, str(ichunk[0]))) continue contig, strand = chunk[0].contig, chunk[0].strand if options.is_gtf: name = chunk[0].transcript_id else: if options.naming_attribute: attr_dict = { x.split("=")[0]: x.split("=")[1] for x in chunk[0].attributes.split(";") } name = attr_dict[options.naming_attribute] else: name = str(chunk[0].attributes) lcontig = contigs[contig] positive = Genomics.IsPositiveStrand(strand) intervals = [(x.start, x.end) for x in chunk] intervals.sort() if masks: if contig in masks: masked_regions = [] for start, end in intervals: masked_regions += [(x.start, x.end) for x in masks[contig].find(start, end)] masked_regions = Intervals.combine(masked_regions) if len(masked_regions): nmasked += 1 if options.remove_masked_regions: intervals = Intervals.truncate(intervals, masked_regions) else: raise NotImplementedError("unimplemented") if len(intervals) == 0: nskipped_masked += 1 if options.loglevel >= 1: options.stdlog.write( "# skipped because fully masked: " "%s: regions=%s masks=%s\n" % (name, str([(x.start, x.end) for x in chunk]), masked_regions)) continue out = intervals if options.extend_at and not options.extend_with: if options.extend_at == "5only": intervals = [(max(0, intervals[0][0] - options.extend_by), intervals[0][0])] elif options.extend_at == "3only": intervals = [(intervals[-1][1], min(lcontig, intervals[-1][1] + options.extend_by))] else: if options.extend_at in ("5", "both"): intervals[0] = (max(0, intervals[0][0] - options.extend_by), intervals[0][1]) if options.extend_at in ("3", "both"): intervals[-1] = (intervals[-1][0], min(lcontig, intervals[-1][1] + options.extend_by)) if not positive: intervals = [(lcontig - x[1], lcontig - x[0]) for x in intervals[::-1]] out.reverse() s = [ fasta.getSequence(contig, strand, start, end) for start, end in intervals ] # IMS: allow for masking of sequences s = Masker.maskSequences(s, options.masker) l = sum([len(x) for x in s]) if (l < options.min_length or (options.max_length and l > options.max_length)): nskipped_length += 1 if options.loglevel >= 1: options.stdlog.write("# skipped because length out of bounds " "%s: regions=%s len=%i\n" % (name, str(intervals), l)) continue if options.extend_at and options.extend_with: extension = "".join((options.extend_with, ) * options.extend_by) if options.extend_at in ("5", "both"): s[1] = extension + s[1] if options.extend_at in ("3", "both"): s[-1] = s[-1] + extension if options.fold_at: n = options.fold_at s = "".join(s) seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)]) else: seq = "\n".join(s) options.stdout.write( ">%s %s:%s:%s\n%s\n" % (name, contig, strand, ";".join(["%i-%i" % x for x in out]), seq)) noutput += 1 E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, " "nskipped_masked=%i, nskipped_length=%i" % (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked, nskipped_length)) E.Stop()
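Removing masked regions amounts to subtracting one interval set from another. A minimal sketch of the semantics assumed here for Intervals.truncate(intervals, masks):

# Minimal sketch of interval subtraction, the semantics assumed for
# Intervals.truncate: masked spans are cut out of each interval and
# whatever remains is kept.
def truncate(intervals, masks):
    out = []
    for start, end in intervals:
        cur = start
        for mstart, mend in sorted(masks):
            if mend <= cur or mstart >= end:
                continue
            if mstart > cur:
                out.append((cur, mstart))
            cur = max(cur, mend)
        if cur < end:
            out.append((cur, end))
    return out

assert truncate([(0, 10)], [(3, 5)]) == [(0, 3), (5, 10)]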
def getSequence(self, contig, strand="+", start=0, end=0, converter=None, as_array=False): """get a genomic fragment. A genomic fragment is identified by the coordinates contig, strand, start, end. The converter function supplied translated these coordinates into 0-based coordinates. By default, start and end are assumed to be pythonic coordinates and are forward/reverse coordinates. If as_array is set to true, return the AString object. This might be beneficial for large sequence chunks. If as_array is set to False, return a python string. """ contig = self.getToken(contig) data = self.mIndex[contig] # dummy is # -> pos_seq for seekable streams # -> block_size for unseekable streams try: pos_id, dummy, lsequence = struct.unpack("QQi", data) except (struct.error, TypeError): pos_id, dummy, lsequence, points = data pos_seq = dummy block_size = dummy if end == 0: end = lsequence if end > lsequence: raise ValueError("3' coordinate on %s out of bounds: %i > %i" % (contig, end, lsequence)) if start < 0: raise ValueError("5' coordinate on %s out of bounds: %i < 0" % (contig, start)) if converter: first_pos, last_pos = converter(start, end, str(strand) in ("+", "1"), lsequence) elif self.mConverter: first_pos, last_pos = self.mConverter(start, end, str(strand) in ("+", "1"), lsequence) else: first_pos, last_pos = start, end if str(strand) in ("-", "0", "-1"): first_pos, last_pos = lsequence - last_pos, lsequence - first_pos if first_pos == last_pos: return "" assert first_pos < last_pos, "first position %i is larger than last position %i " % (first_pos, last_pos) p = AString() if self.mNoSeek: # read directly from position p.fromstring(self.mDatabaseFile.read(block_size, data[3], first_pos, last_pos)) else: first_pos += pos_seq last_pos += pos_seq self.mDatabaseFile.seek(first_pos) p.fromstring(self.mDatabaseFile.read(last_pos - first_pos)) if str(strand) in ("-", "0", "-1"): p = AString(Genomics.complement(str(p))) if self.mTranslator: return self.mTranslator.translate(p) elif as_array: return p else: if IS_PY3: return p.tostring().decode("ascii") else: return p.tostring()
def _buildAllele(allele_id, transcript, exons, introns, offsets,
                 virtual_coordinates=False,
                 reference_exons=None):

    # nb: ``frameshiftsize`` and ``is_seleno`` are module-level settings
    # of the calling script; they are used below but not passed in.

    # the exon-start bookkeeping below referenced an undefined name
    # ``reference_coordinates``; it is assumed here to be the complement
    # of the ``virtual_coordinates`` flag.
    reference_coordinates = not virtual_coordinates

    def _getOffset(pos, offsets):
        x = 0
        while x < len(offsets) and offsets[x][0] <= pos:
            x += 1
        x -= 1
        if x >= 0:
            return offsets[x][1]
        else:
            return 0

    def _sumIndels(ss):
        '''sum indels within ss'''
        c = 0
        for s in ss:
            c += len(s) - 1
        return c

    def _getEndOffsets(ss):
        '''get the offset at exons due to deletions at start/end of exon.'''
        l = len(ss)
        x = 0
        while x < l and ss[x] == "":
            x += 1
        start_offset = x

        x = l - 1
        while x >= 0 and ss[x] == "":
            x -= 1
        if x >= 0:
            return start_offset, (l - 1) - x
        else:
            return start_offset, 0

    def _addCds2Reference(map_cds2reference,
                          cds_start,
                          cds_seq,
                          reference_start):
        '''add cds to reference'''
        c, r = cds_start, reference_start
        for x in cds_seq:
            l = len(x)
            if l == 0:
                r += 1
            else:
                map_cds2reference.addPair(c, r)
                c += l
                r += 1

    # counts
    is_splice_truncated = False
    is_nmd_knockout = False
    is_stop_truncated = False
    nuncorrected_frameshifts = 0
    ncorrected_frameshifts = 0
    nframeshifts = 0
    nsplice_noncanonical = 0
    reference_first_stop_start = -1
    reference_first_stop_end = -1

    # map between the new cds sequence and the reference
    # sequence
    map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

    ###################################################
    # process first exon
    exon = transcript[0]
    transcript_id = exon.transcript_id

    # collect offset for exon.start
    genome_start = exon.start
    genome_start += _getOffset(genome_start, offsets)
    lcds, cds = 0, []
    cds_starts = [0]

    # still need to deal with deletions of first base:
    exon_starts = [genome_start]
    exon_key = (exon.start, exon.end)
    exon_sequence = exons[exon_key]
    exon_seq = "".join(exon_sequence)

    cds.append(exon_seq)
    _addCds2Reference(map_cds2reference,
                      lcds,
                      exon_sequence,
                      exon.start)
    lcds = len(exon_seq)

    if len(exon_seq) != exon.end - exon.start:
        nframeshifts += 1

    # add first exon to genome position
    genome_pos = genome_start + len(exon_seq)
    last_end = exon.end

    # correct for deletions at start/end of exon
    start_offset, end_offset = _getEndOffsets(exon_sequence)

    # length of original transcript
    loriginal = sum([x.end - x.start for x in transcript])

    if E.global_options.loglevel >= 8:
        print("%i: exon_indels (%i-%i):" %
              (allele_id, exon.start, exon.end))
        for x, c in enumerate(exons[exon_key]):
            if len(c) != 1:
                print(x + exon.start, ":%s:" % c)
        print()
        print(exons[exon_key])
        print("genome_pos=", genome_pos,
              ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
              ", len(exon_seq)=", len(exon_seq),
              ", len(exon)=", exon.end - exon.start,
              ", offsets=%i,%i," % (start_offset, end_offset),
              ", offset at start=", _getOffset(exon.start, offsets),
              ", offset at end=", _getOffset(exon.end, offsets))

    for exon in transcript[1:]:

        last_exon_sequence = exon_sequence
        last_start_offset, last_end_offset = start_offset, end_offset

        # get the next intron/exon parameters
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        start_offset, end_offset = _getEndOffsets(exon_sequence)
        intron_key = (last_end, exon.start)

        if last_end == exon.start:
            # catch empty introns
            intron_sequence = []
            intron_key = None
        else:
            intron_sequence = introns[intron_key]

        intron_seq = "".join(intron_sequence)

        ###################################################
        # add preceding intron
        new_exon = True

        if len(intron_seq) > frameshiftsize:

            intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                intron_seq)
            if intron_name == "unknown":
                if intron_seq[:2].islower() and intron_seq[-2:].islower():
                    E.debug("%s: transcript has unknown splice signal - "
                            "kept because not a variant: %s: %s:%s" %
                            (transcript_id, intron_name,
                             intron_seq5, intron_seq3))
                    nsplice_noncanonical += 1
                else:
                    is_splice_truncated = True
                    E.debug("%s: transcript has splice truncated allele: "
                            "%s: %s:%s" %
                            (transcript_id, intron_name,
                             intron_seq5, intron_seq3))
                    break
            # start a new exon
            cds_starts.append(lcds)

        else:
            # treat as frameshifting intron
            #
            # frame-shifting introns are checked if they are
            # fixed by indels either in the intron itself or
            # the terminal exon sequence. To this end, the effective
            # size of the intron is computed:
            # effective size of intron =
            # indels at terminal x bases at previous exon
            # + size of intron
            # + indels at terminal x bases at next exon
            effective_intron_size = len(intron_seq)
            previous_indels = _sumIndels(
                last_exon_sequence[max(0, -frameshiftsize):])
            next_indels = _sumIndels(exon_sequence[:frameshiftsize])
            effective_intron_size += previous_indels + next_indels

            if previous_indels + next_indels == 0 and \
                    len(intron_seq) % 3 == 0:
                has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                     is_seleno=is_seleno)
            else:
                has_stop = False

            if effective_intron_size % 3 == 0 and not has_stop:
                E.debug("%s: fixed frame-shifting intron %i-%i of "
                        "size %i (size:%i, indels:%i,%i)" %
                        (transcript_id, last_end, exon.start,
                         effective_intron_size,
                         len(intron_seq),
                         previous_indels, next_indels,))

                # add to previous exon
                cds.append(intron_seq)
                lcds += len(intron_seq)
                ncorrected_frameshifts += 1
                new_exon = False
            else:
                E.debug("%s: could not fix frame-shifting intron %i-%i of "
                        "size %i (size:%i, indels:%i,%i, has_stop=%i)" %
                        (transcript_id, last_end, exon.start,
                         effective_intron_size,
                         len(intron_seq),
                         previous_indels, next_indels,
                         has_stop))
                nuncorrected_frameshifts += 1
                # start a new exon
                cds_starts.append(lcds)

        if E.global_options.loglevel >= 8:
            print("%i: intron_indels (%i-%i):" %
                  (allele_id, last_end, exon.start))
            if intron_key:
                for x, c in enumerate(introns[intron_key]):
                    if len(c) != 1:
                        print(x + last_end, ":%s:" % c)
                print()
                print(introns[intron_key])
                print("genome_pos=", genome_pos,
                      ",intron=%i-%i" % (genome_pos,
                                         genome_pos + len(intron_seq)),
                      ", len(intron_seq)=", len(intron_seq),
                      ", len(intron)=", exon.start - last_end,
                      ", offset at start=", _getOffset(last_end, offsets),
                      ", offset at end=", _getOffset(exon.start, offsets))
            else:
                print("empty intron")

        genome_pos += len(intron_seq)

        # assertion - check if genomic coordinate of intron is consistent
        # with offset
        test_offset = _getOffset(exon.start, offsets)
        is_offset = genome_pos - exon.start
        assert is_offset == test_offset, \
            "intron offset difference: %i != %i" % (is_offset, test_offset)

        ###################################################
        # add the exon
        exon_seq = "".join(exon_sequence)
        cds.append(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        if new_exon:
            if reference_coordinates:
                exon_starts.append(exon.start + start_offset)
            else:
                exon_starts.append(genome_pos)

        _addCds2Reference(map_cds2reference,
                          lcds,
                          exon_sequence,
                          exon.start)
        lcds += len(exon_seq)
        last_end = exon.end

        if E.global_options.loglevel >= 8:
            print("%i: exon_indels (%i-%i):" %
                  (allele_id, exon.start, exon.end))
            for x, c in enumerate(exons[exon_key]):
                if len(c) != 1:
                    print(x + exon.start, ":%s:" % c)
            print()
            print(exons[exon_key])
            print("genome_pos=", genome_pos,
                  ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
                  ", len(exon_seq)=", len(exon_seq),
                  ", len(exon)=", exon.end - exon.start,
                  ", offsets=%i,%i," % (start_offset, end_offset),
                  ", offset at start=", _getOffset(exon.start, offsets),
                  ", offset at end=", _getOffset(exon.end, offsets))

        genome_pos += len(exon_seq)

        test_offset = _getOffset(exon.end, offsets)
        is_offset = genome_pos - exon.end
        assert is_offset == test_offset, \
            "exon offset difference: %i != %i" % (is_offset, test_offset)

    cds = "".join(cds)
    assert lcds == len(cds)

    # fix incomplete codons at the end of the sequence
    if lcds % 3 != 0:
        offset = lcds % 3
        cds = cds[:-offset]

    # add frame correction for transcripts that do not start at frame=0
    start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

    # n are ignored (? in sequence to deal with genes like Muc2)
    peptide = Genomics.translate("n" * start_frame + cds,
                                 is_seleno=is_seleno,
                                 prefer_lowercase=False,
                                 ignore_n=True)

    # find the first stop codon
    if start_frame != 0:
        # ignore first, potentially incomplete base
        pep_first_stop = peptide.upper().find("X", 1)
    else:
        pep_first_stop = peptide.upper().find("X")

    E.debug("%s: translated peptide = %s, first stop at %i" %
            (transcript_id, peptide, pep_first_stop))

    peptide = peptide.replace("?", "x")

    if E.global_options.loglevel >= 8:
        E.debug("peptide=%s" % peptide)
        E.debug("cds=%s" % cds)

    E.debug("%s: start_frame=%i, first stop at %i/%i" %
            (transcript_id, start_frame, pep_first_stop, len(peptide)))

    lpeptide, lcds = len(peptide), len(cds)

    # check for non-sense mediated decay
    if pep_first_stop != -1:
        cds_first_stop = pep_first_stop * 3 - start_frame
        if cds_first_stop < cds_starts[-1]:
            if ncorrected_frameshifts or nuncorrected_frameshifts:
                E.warn("nmd knockout transcript %s has frameshifts: "
                       "%i corrected, %i uncorrected" %
                       (transcript_id,
                        ncorrected_frameshifts,
                        nuncorrected_frameshifts))
            is_nmd_knockout = True
            cds = peptide = ""
            lpeptide, lcds = 0, 0
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        elif pep_first_stop < len(peptide) - 1:
            is_stop_truncated = True
            cds = cds[:cds_first_stop]
            # the original dropped the result of this slice; the
            # truncated peptide is what should be kept:
            peptide = peptide[:pep_first_stop]
            lpeptide, lcds = len(peptide), len(cds)
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        else:
            E.warn("first stop at %i(cds=%i) ignored: last exon start at %i" %
                   (pep_first_stop, cds_first_stop, cds_starts[-1]))
    else:
        # -1 for no stop codon found
        pep_first_stop = -1
        cds_first_stop = -1
        lpeptide, lcds = len(peptide), len(cds)

    # the original tested ``peptide is None``, which can never be true
    # at this point; test for the empty string instead
    if not peptide and nframeshifts == 0:
        E.warn("transcript %s is knockout, though there are no indels - "
               "must be nonsense mutation" % (transcript_id))

    # build frames
    frames = [start_frame]
    start = start_frame
    l = 0
    for end in cds_starts[1:]:
        l += end - start
        frames.append((3 - l % 3) % 3)
        start = end

    return Allele._make((cds,
                         peptide,
                         len(cds_starts),
                         cds_starts,
                         exon_starts,
                         frames,
                         is_nmd_knockout,
                         is_splice_truncated,
                         is_stop_truncated,
                         nframeshifts,
                         ncorrected_frameshifts,
                         nuncorrected_frameshifts,
                         pep_first_stop,
                         lpeptide,
                         cds_first_stop,
                         lcds,
                         reference_first_stop_start,
                         reference_first_stop_end,
                         loriginal,
                         nsplice_noncanonical,
                         )), map_cds2reference
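# A minimal sketch of the offset-list convention used by _getOffset()
# above, assuming ``offsets`` is a position-sorted list of
# (genomic_position, cumulative_indel_offset) pairs built by the caller.
# bisect replaces the linear scan but returns the same values; the
# function and variable names here are illustrative only.

import bisect

def get_offset_sketch(pos, offsets):
    """return the cumulative indel offset in effect at ``pos``."""
    positions = [p for p, _ in offsets]
    i = bisect.bisect_right(positions, pos) - 1
    return offsets[i][1] if i >= 0 else 0

# a net +2 offset from position 100 on, then a net +1 from position 200:
offsets = [(100, 2), (200, 1)]
assert get_offset_sketch(50, offsets) == 0
assert get_offset_sketch(150, offsets) == 2
assert get_offset_sketch(250, offsets) == 1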
def Align(self, method, anchor=0, loglevel=1):
    """align a pair of sequences.

    get rid of this and use a method class instead in the future
    """

    map_a2b = alignlib_lite.py_makeAlignmentVector()
    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign(self.mOptionsDialign)
        dialign.Align(s1, s2, map_a2b)
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
        blastz.Align(s1, s2, map_a2b)
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement(self.mSequence2)
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
        dialignlgs.Align(s1, s2, map_a2b)
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align(s1, s2, map_a2b)
    elif method == "clustal":
        raise NotImplementedError("clustal wrapper needs to be updated")
        # unreachable until the wrapper is updated:
        clustal = WrapperClustal.Clustal()
        clustal.Align(s1, s2, map_a2b)
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_GLOBAL,
            gop=-12.0, gep=-2.0)
        alignator.align(map_a2b, seq1, seq2)
    elif method == "sw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        # alignator_sw and min_score_sw are module-level settings
        alignlib_lite.py_performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw)
    else:
        # use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # remove the anchor padding from rows and columns and shift
        # the alignment back to the original coordinates
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
        map_a2b.removeRowRegion(1, anchor)
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
        map_a2b.removeColRegion(1, anchor)
        map_a2b.moveAlignment(-anchor, -anchor)

    f = alignlib_lite.py_AlignmentFormatExplicit(
        map_a2b,
        alignlib_lite.py_makeSequence(self.mSequence1),
        alignlib_lite.py_makeSequence(self.mSequence2))

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = \
        f.mRowAlignment, f.mColAlignment
    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
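# Why the anchor padding in Align() helps (illustrative, standalone):
# local aligners may clip noisy sequence ends, so both sequences are
# padded with ``anchor`` identical residues to force end-to-end
# alignment; the padding is then removed in one-based alignment
# coordinates, as removeRowRegion/moveAlignment do above.
anchor = 4
s1, s2 = "ACGTACGT", "ACCTACGT"
p1 = "A" * anchor + s1 + "A" * anchor
p2 = "A" * anchor + s2 + "A" * anchor
# a full-length ungapped alignment of p1:p2 in one-based coordinates
pairs = [(i, i) for i in range(1, len(p1) + 1)]
# trim the padded region and shift back to the original coordinates
trimmed = [(x - anchor, y - anchor)
           for x, y in pairs
           if anchor < x <= anchor + len(s1) and
           anchor < y <= anchor + len(s2)]
assert trimmed[0] == (1, 1) and trimmed[-1] == (len(s1), len(s2))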
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the commandline")

    fn1, fn2 = args
    counter = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(IOTools.openFile(fn1))
        iter2 = Fastq.iterate(IOTools.openFile(fn2))

        tuple_size = 2
        for left, right in zip(iter1, iter2):
            counter.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals

            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals

            # reverse complement
            s2 = Genomics.complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            # (the original re-used the counter variable for the k-mer
            # here, clobbering it)
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            # (the original shadowed the sorted() builtin here)
            ranked = sorted([(y, x) for x, y in offsets.items()])
            max_count, max_offset = ranked[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(str(new_entry) + "\n")
            counter.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(counter))
    E.Stop()
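# Standalone sketch of the diagonal trick used by the join method:
# k-mers shared between read1 and the reverse-complemented read2 vote
# for an offset (a diagonal in the dotplot), and the best-supported
# offset locates the overlap. The sign convention here differs from
# the script's; names and toy reads are illustrative only.
import collections

def best_start(s1, s2_rc, k=2):
    """most-voted start position of s2_rc within s1 (diagonal voting)."""
    index = collections.defaultdict(list)
    for y in range(len(s1) - k + 1):
        index[s1[y:y + k]].append(y)
    votes = collections.defaultdict(int)
    for x in range(len(s2_rc) - k + 1):
        for y in index[s2_rc[x:x + k]]:
            votes[y - x] += 1
    return max(votes.items(), key=lambda kv: kv[1])[0]

s1 = "ACGTACGGTC"      # left read
s2_rc = "ACGGTCTTGA"   # right read, already reverse-complemented
start = best_start(s1, s2_rc)             # -> 4
merged = s1 + s2_rc[len(s1) - start:]     # append the non-overlapping tail
assert merged == "ACGTACGGTCTTGA"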
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p", "--peptides-fasta-file", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds-gtf-file", "--cdnas", dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with map of peptide identifiers "
                      "to cdna identifiers [%default].")

    parser.add_option("--output-identifier", dest="output_identifier",
                      type="choice", choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f", "--output-format", dest="output_format",
                      type="choice", choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        filename_peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # nb: the original compared against "cds", which is not among the
    # choices for --output-identifier and thus could never be true
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            options.stdout.write("\t".join(map(str, (
                identifier,
                alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                len(cur_record.sequence),
                len(cds_sequences[cds_identifier])))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)
            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
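# The peptide->cds map built above aligns residue i of the peptide to
# codon i of the cds; for an ungapped pair the relationship is a plain
# factor of three. A minimal self-contained sketch (names illustrative):
def peptide_to_cds_range(residue_index):
    """return the (start, end) slice of the cds encoding this residue."""
    return residue_index * 3, residue_index * 3 + 3

cds = "ATGGCCAAA"                   # Met-Ala-Lys
start, end = peptide_to_cds_range(1)
assert cds[start:end] == "GCC"      # codon of the second residue (Ala)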
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--query-psl-file", dest="filename_query", type="string", help="fasta filename with queries.") parser.add_option("--target-psl-file", dest="filename_target", type="string", help="fasta filename with target.") parser.add_option("-m", "--method", dest="method", type="choice", choices=( "full", "pileup-query", "pileup-target", "gapless"), help="method to use for constructing the alignment [%default].") parser.add_option("--forward-query", dest="forward_query", action="store_true", help="reverse-complement sequences such that query is always on forward strand [%default]") parser.add_option("--target-prefix", dest="target_prefix", type="string", help="prefix to use for target [%default].") parser.add_option("--query-prefix", dest="query_prefix", type="string", help="prefix to use for query [%default].") parser.add_option("--id", dest="id", type="choice", choices=("numeric", "query"), help="choose type of identifier to use [%default]") parser.set_defaults( filename_query=None, filename_target=None, method="full", output_format_id="%06i", target_prefix="", query_prefix="", forward_query=False, ) (options, args) = E.Start(parser) if options.filename_query: query = IndexedFasta.IndexedFasta(options.filename_query) if options.filename_target: target = IndexedFasta.IndexedFasta(options.filename_target) if options.method == "full": getAlignment = getAlignmentFull id = 0 for match in Blat.iterator(options.stdin): if options.loglevel >= 2: options.stdout.write("# %s\n" % str(match)) m = match.getMapQuery2Target() m.moveAlignment(-min(match.mQueryBlockStarts), - min(match.mSbjctBlockStarts)) q = query.getSequence( match.mQueryId, match.strand, match.mQueryFrom, match.mQueryTo) t = target.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo) query_ali, sbjct_ali = getAlignment(m, q, t, options) if match.strand == "-" and options.forward_query: query_ali = Genomics.complement(query_ali) sbjct_ali = Genomics.complement(sbjct_ali) options.stdout.write(">%s%s:%s/%i-%i\n%s\n>%s%s:%s%s/%i-%i\n%s\n" % (options.query_prefix, options.output_format_id % id, match.mQueryId, match.mQueryFrom, match.mQueryTo, query_ali, options.target_prefix, options.output_format_id % id, match.mSbjctId, match.strand, match.mSbjctFrom, match.mSbjctTo, sbjct_ali)) id += 1 E.Stop()
genome = IndexedFasta.IndexedFasta(options.genome_file)

assert options.filename_regions is not None, \
    "please supply a gff formatted filename with regions"

regions = GTF.readAsIntervals(
    GFF.iterator(IOTools.openFile(options.filename_regions, "r")))

# build pairs for complement
reverse_splice_pairs = []
forward_splice_pairs = options.splice_pairs
left_tokens, right_tokens = {}, {}
x = 0
for a, b in forward_splice_pairs:
    assert len(a) == 2, "only two-residue patterns allowed"
    assert len(b) == 2, "only two-residue patterns allowed"

    ca, cb = Genomics.complement(a), Genomics.complement(b)
    reverse_splice_pairs.append((b, a))
    left_tokens[a] = x
    left_tokens[cb] = x + 1
    right_tokens[b] = x
    right_tokens[ca] = x + 1
    x += 2

search_area = options.search_area
read_length = options.read_length
joined = options.joined

ninput, noutput = 0, 0

if joined:
    outfile_coordinates = IOTools.openFile(
        options.output_filename_pattern % "coords", "w")
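# How the token tables work, with the canonical GT/AG pair as a worked
# example (assuming Genomics.complement() reverse-complements): on the
# forward strand a GT..AG intron is read directly; on the reverse strand
# the same signal reads complement("AG")..complement("GT") = CT..AC.
# The hand-rolled complement below stands in for the Genomics module.
a, b = "GT", "AG"
comp = {"A": "T", "C": "G", "G": "C", "T": "A"}
rc = lambda s: "".join(comp[c] for c in reversed(s))
ca, cb = rc(a), rc(b)            # "AC", "CT"
left_tokens = {a: 0, cb: 1}      # {"GT": 0, "CT": 1}
right_tokens = {b: 0, ca: 1}     # {"AG": 0, "AC": 1}
# a read spanning "CT...AC" is thus assigned to the reverse strand (token 1)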
    if param_filename_benchmark_synonyms:
        infile = open(param_filename_benchmark_synonyms, "r")
        param_benchmark_synonyms = {}
        for line in infile:
            if line[0] == "#":
                continue
            value, key = line[:-1].split("\t")
            param_benchmark_synonyms[key] = value
    else:
        param_benchmark_synonyms = {}
# this excerpt begins mid-block: the else below closes an enclosing
# test (presumably on a benchmark filename) that is not shown here
else:
    param_benchmarks = {}
    param_benchmark_synonyms = {}

# read peptide sequences
if param_filename_peptides:
    peptide_sequences = Genomics.ReadPeptideSequences(
        open(param_filename_peptides, "r"))
else:
    peptide_sequences = {}

if param_conserve_memory:
    old_predictions, filename_old_predictions = tempfile.mkstemp()
    os.close(old_predictions)
    old_predictions = PredictionFile.PredictionFile()
    old_predictions.open(filename_old_predictions, "w")
else:
    # array with final predictions
    old_predictions = []

if param_loglevel >= 1:
    print("# reading predictions.")
    sys.stdout.flush()
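# The synonym file parsed above is assumed to be two tab-separated
# columns, value<TAB>key, with '#' comment lines. A tiny worked example
# of the same parse, with made-up identifiers:
lines = [
    "# benchmark synonyms",
    "ENSG0001\tgene_alias_1",
    "ENSG0002\tgene_alias_2",
]
synonyms = {}
for line in lines:
    if line.startswith("#"):
        continue
    value, key = line.split("\t")
    synonyms[key] = value
assert synonyms["gene_alias_1"] == "ENSG0001"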
def main(argv=None): if not argv: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--merge-exons", dest="merge_exons", action="store_true", help="merge overlapping exons of all transcripts " "within a gene. " "The merged exons will be output. " "Input needs to be sorted by gene [default=%default].") parser.add_option("-t", "--merge-transcripts", dest="merge_transcripts", action="store_true", help="merge all transcripts within a gene. " "The entry will span the whole gene " "(exons and introns). " "The transcript does not include the UTR unless " "--with-utr is set. [default=%default].") parser.add_option("--merge-genes", dest="merge_genes", action="store_true", help="merge overlapping genes if their exons overlap. " "A gene with a single transcript containing all exons " "of the overlapping transcripts will be output. " "This operation ignores strand information. " "The input needs to be sorted by transcript " "[default=%default].") parser.add_option("--merge-exons-distance", dest="merge_exons_distance", type="int", help="distance in nucleotides between " "exons to be merged [default=%default].") parser.add_option("-j", "--join-exons", dest="join_exons", action="store_true", help="join all exons per transcript. " "A new transcript will be " "output that spans a whole transcript. " "Input needs to be sorted by transcript " "[default=%default].") parser.add_option("--unset-genes", dest="unset_genes", type="string", help="unset gene identifiers, keeping " "transcripts intact. " "New gene identifiers are set to the " "pattern given. For example, " "'--unset-genes=%06i' [default=%default].") parser.add_option("--sort", dest="sort", type="choice", choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"), help="sort input data [default=%default].") parser.add_option("-u", "--with-utr", dest="with_utr", action="store_true", help="include utr in merged transcripts " "[default=%default].") parser.add_option("--intersect-transcripts", dest="intersect_transcripts", action="store_true", help="intersect all transcripts within a gene. " "The entry will only span those bases " "that are covered by all transcripts. " "The transcript does not include the UTR unless " "--with-utr is set. This method " "will remove all other features (stop_codon, etc.) " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-i", "--merge-introns", dest="merge_introns", action="store_true", help="merge and output all introns within a " "gene. The output will contain " "all intronic regions within a gene. Single exon genes " "are skipped. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-g", "--set-transcript-to-gene", "--set-transcript2gene", dest="set_transcript2gene", action="store_true", help="set the transcript_id to the " "gene_id [default=%default].") parser.add_option("--set-protein-to-transcript", dest="set_protein2transcript", action="store_true", help="set the protein_id to the " "transcript_id [default=%default].") parser.add_option("--add-protein-id", dest="add_protein_id", type="string", help="add a protein_id for each transcript_id. 
" "The argument is a filename containing a mapping " "between " "transcript_id to protein_id [default=%default].") parser.add_option("-G", "--set-gene-to-transcript", "--set-gene2transcript", dest="set_gene2transcript", action="store_true", help="set the gene_id to the " "transcript_id [default=%default].") parser.add_option("-d", "--set-score2distance", dest="set_score2distance", action="store_true", help="set the score field for each feature to the " "distance to " "transcription start site [default=%default].") parser.add_option("--exons2introns", dest="exons2introns", action="store_true", help="for each gene build an 'intronic' transcript " "containing the union of all intronic regions " "of all transcripts in a gene." "The features are labeled as 'intron'." "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-f", "--filter", dest="filter", type="choice", choices=("gene", "transcript", "longest-gene", "longest-transcript", "representative-transcript"), help="apply a filter to the input file. Available " "filters are: " "'gene': filter by gene_id, " "'transcript': filter by transcript_id, " "'longest-gene': output the longest gene for " "overlapping genes ," "'longest-transcript': output the longest " "transcript per gene," "'representative-transcript': output the " "representative transcript per gene. " "The representative transcript is the transcript " "that shares most exons with " "the other transcripts in a gene. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-r", "--rename", dest="rename", type="choice", choices=("gene", "transcript"), help="rename genes or transcripts with a map " "given by the option `--apply`. " "Those that can not be renamed are removed " "[default=%default].") parser.add_option("--renumber-genes", dest="renumber_genes", type="string", help="renumber genes according to the given pattern. " "[default=%default].") parser.add_option("--renumber-transcripts", dest="renumber_transcripts", type="string", help="renumber transcripts according to the " "given pattern. " "[default=%default].") parser.add_option("-a", "--apply", dest="filename_filter", type="string", metavar="tsv", help="filename of ids to map/filter [default=%default].") parser.add_option("--invert-filter", dest="invert_filter", action="store_true", help="when using --filter, invert selection " "(like grep -v). 
" "[default=%default].") parser.add_option("--sample-size", dest="sample_size", type="int", help="extract a random sample of size # if the option " "'--filter' is set[default=%default].") parser.add_option("--intron-min-length", dest="intron_min_length", type="int", help="minimum length for introns (for --exons2introns) " "[default=%default].") parser.add_option("--min-exons-length", dest="min_exons_length", type="int", help="minimum length for gene (sum of exons) " "(--sample-size) [default=%default].") parser.add_option("--intron-border", dest="intron_border", type="int", help="number of residues to exclude at intron at either end " "(--exons2introns) [default=%default].") parser.add_option("--transcripts2genes", dest="transcripts2genes", action="store_true", help="cluster overlapping transcripts into genes.") parser.add_option("--reset-strand", dest="reset_strand", action="store_true", help="remove strandedness of features (set to '.') when " "using --transcripts2genes" "[default=%default].") parser.add_option("--remove-overlapping", dest="remove_overlapping", type="string", metavar="gff", help="remove all transcripts that overlap intervals " "in a gff-formatted file." "The comparison ignores strand " "[default=%default].") parser.add_option("--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[default=%default]") parser.add_option("--remove-duplicates", dest="remove_duplicates", type="choice", choices=("gene", "transcript", "ucsc", "coordinates"), help="remove duplicates by gene/transcript. " "If ``ucsc`` is chosen, transcripts ending on _dup# are " "removed. This is necessary to remove duplicate entries " "that are next to each other in the sort order " "[%default]") parser.add_option("--rename-duplicates", dest="rename_duplicates", action="store_true", help="rename duplicate gene_ids and transcript_ids by " "addition of a numerical suffix") parser.set_defaults( sort=None, merge_exons=False, join_exons=False, merge_exons_distance=0, merge_transcripts=False, set_score2distance=False, set_gene2transcript=False, set_transcript2gene=False, set_protein2transcript=False, add_protein_id=None, filename_filter=None, filter=None, exons2introns=None, merge_genes=False, intron_border=None, intron_min_length=None, sample_size=0, min_exons_length=0, transripts2genes=False, reset_strand=False, with_utr=False, invert_filter=False, remove_duplicates=None, remove_overlapping=None, renumber_genes=None, unset_genes=None, renumber_transcripts=None, strict=True, intersect_transcripts=False, rename_duplicates=False, ) (options, args) = E.Start(parser, argv=argv) ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0 if options.set_transcript2gene: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("transcript_id", gff.gene_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.remove_duplicates: counts = collections.defaultdict(int) if options.remove_duplicates == "ucsc": store = [] remove = set() f = lambda x: x[0].transcript_id gffs = GTF.transcript_iterator( GTF.iterator(options.stdin), strict=False) outf = lambda x: "\n".join([str(y) for y in x]) for entry in gffs: ninput += 1 store.append(entry) id = f(entry) if "_dup" in id: remove.add(re.sub("_dup\d+", "", id)) remove.add(id) for entry in store: id = f(entry) if id not in remove: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s" % (id)) else: if options.remove_duplicates == "gene": gffs = 
GTF.gene_iterator( GTF.iterator(options.stdin), strict=False) f = lambda x: x[0][0].gene_id outf = lambda x: "\n".join( ["\n".join([str(y) for y in xx]) for xx in x]) elif options.remove_duplicates == "transcript": gffs = GTF.transcript_iterator( GTF.iterator(options.stdin), strict=False) f = lambda x: x[0].transcript_id outf = lambda x: "\n".join([str(y) for y in x]) elif options.remove_duplicates == "coordinates": gffs = GTF.chunk_iterator(GTF.iterator(options.stdin)) f = lambda x: x[0].contig + "_" + \ str(x[0].start) + "-" + str(x[0].end) outf = lambda x: "\n".join([str(y) for y in x]) store = [] for entry in gffs: ninput += 1 store.append(entry) id = f(entry) counts[id] += 1 # Assumes GTF file sorted by contig then start last_id = "" if options.remove_duplicates == "coordinates": for entry in store: id = f(entry) if id == last_id: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) else: options.stdout.write(outf(entry) + "\n") noutput += 1 last_id = id else: for entry in store: id = f(entry) if counts[id] == 1: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) elif options.sort: for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort): ninput += 1 options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_gene2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("gene_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_protein2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("protein_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.add_protein_id: transcript2protein = IOTools.readMap(open(options.add_protein_id, "r")) missing = set() for gff in GTF.iterator(options.stdin): ninput += 1 if gff.transcript_id not in transcript2protein: if gff.transcript_id not in missing: E.debug( ("removing transcript '%s' due to " "missing protein id") % gff.transcript_id) missing.add(gff.transcript_id) ndiscarded += 1 continue gff.setAttribute( "protein_id", transcript2protein[gff.transcript_id]) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 E.info("transcripts removed due to missing protein ids: %i" % len(missing)) elif options.join_exons: for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(exons[0].strand) contig = exons[0].contig transid = exons[0].transcript_id geneid = exons[0].gene_id biotype = exons[0].source all_start, all_end = min([x.start for x in exons]), max( [x.end for x in exons]) y = GTF.Entry() y.contig = contig y.source = biotype y.feature = "transcript" y.start = all_start y.end = all_end y.strand = strand y.transcript_id = transid y.gene_id = geneid options.stdout.write("%s\n" % str(y)) elif options.merge_genes: # merges overlapping genes # gffs = GTF.iterator_sorted_chunks( GTF.flat_gene_iterator(GTF.iterator(options.stdin)), sort_by="contig-strand-start") def iterate_chunks(gff_chunks): last = gff_chunks.next() to_join = [last] for gffs in gff_chunks: d = gffs[0].start - last[-1].end if gffs[0].contig == last[0].contig and \ gffs[0].strand == last[0].strand: assert gffs[0].start >= last[0].start, \ ("input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \ (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in 
gffs])) if gffs[0].contig != last[0].contig or \ gffs[0].strand != last[0].strand or \ d > 0: yield to_join to_join = [] last = gffs to_join.append(gffs) yield to_join raise StopIteration for chunks in iterate_chunks(gffs): ninput += 1 if len(chunks) > 1: gene_id = "merged_%s" % chunks[0][0].gene_id transcript_id = "merged_%s" % chunks[0][0].transcript_id info = ",".join([x[0].gene_id for x in chunks]) else: gene_id = chunks[0][0].gene_id transcript_id = chunks[0][0].transcript_id info = None intervals = [] for c in chunks: intervals += [(x.start, x.end) for x in c] intervals = Intervals.combine(intervals) # take single strand strand = chunks[0][0].strand for start, end in intervals: y = GTF.Entry() y.fromGTF(chunks[0][0], gene_id, transcript_id) y.start = start y.end = end y.strand = strand if info: y.addAttribute("merged", info) options.stdout.write("%s\n" % str(y)) nfeatures += 1 noutput += 1 elif options.renumber_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 if gtf.gene_id not in map_old2new: map_old2new[gtf.gene_id] = options.renumber_genes % ( len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[gtf.gene_id]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.unset_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = gtf.transcript_id if key not in map_old2new: map_old2new[key] = options.unset_genes % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.renumber_transcripts: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = (gtf.gene_id, gtf.transcript_id) if key not in map_old2new: map_old2new[key] = options.renumber_transcripts % ( len(map_old2new) + 1) gtf.setAttribute("transcript_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.transcripts2genes: transcripts = set() genes = set() reset_strand = options.reset_strand for gtfs in GTF.iterator_transcripts2genes( GTF.iterator(options.stdin)): ninput += 1 for gtf in gtfs: if reset_strand: gtf.strand = "." 
options.stdout.write("%s\n" % str(gtf)) transcripts.add(gtf.transcript_id) genes.add(gtf.gene_id) nfeatures += 1 noutput += 1 E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes))) elif options.rename: map_old2new = IOTools.readMap(open(options.filename_filter, "r")) if options.rename == "transcript": is_gene_id = False elif options.rename == "gene": is_gene_id = True for gff in GTF.iterator(options.stdin): ninput += 1 if is_gene_id: if gff.gene_id in map_old2new: gff.setAttribute("gene_id", map_old2new[gff.gene_id]) else: E.debug("removing missing gene_id %s" % gff.gene_id) ndiscarded += 1 continue else: if gff.transcript_id in map_old2new: gff.setAttribute( "transcript_id", map_old2new[gff.transcript_id]) else: E.debug("removing missing transcript_id %s" % gff.transcript_id) ndiscarded += 1 continue noutput += 1 options.stdout.write("%s\n" % str(gff)) elif options.filter: keep_genes = set() if options.filter == "longest-gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) coords = [] gffs = [] for gff in iterator: gff.sort(key=lambda x: x.start) coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id)) gffs.append(gff) coords.sort() last_contig = None max_end = 0 longest_gene_id = None longest_length = None for contig, start, end, gene_id in coords: ninput += 1 if contig != last_contig or start >= max_end: if longest_gene_id: keep_genes.add(longest_gene_id) longest_gene_id = gene_id longest_length = end - start max_end = end else: if end - start > longest_length: longest_length, longest_gene_id = end - start, gene_id last_contig = contig max_end = max(max_end, end) keep_genes.add(longest_gene_id) invert = options.invert_filter for gff in gffs: keep = gff[0].gene_id in keep_genes if (keep and not invert) or (not keep and invert): noutput += 1 for g in gff: nfeatures += 1 options.stdout.write("%s\n" % g) else: ndiscarded += 1 elif options.filter in ("longest-transcript", "representative-transcript"): iterator = GTF.gene_iterator(GTF.iterator(options.stdin)) def selectLongestTranscript(gene): r = [] for transcript in gene: transcript.sort(key=lambda x: x.start) length = transcript[-1].end - transcript[0].start r.append((length, transcript)) r.sort() return r[-1][1] def selectRepresentativeTranscript(gene): '''select a representative transcript. The representative transcript represent the largest number of exons over all transcripts. 
''' all_exons = [] for transcript in gene: all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"]) exon_counts = {} for key, exons in itertools.groupby(all_exons): exon_counts[key] = len(list(exons)) transcript_counts = [] for transcript in gene: count = sum([exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon"]) transcript_counts.append((count, transcript)) transcript_counts.sort() return transcript_counts[-1][1] if options.filter == "longest-transcript": _select = selectLongestTranscript elif options.filter == "representative-transcript": _select = selectRepresentativeTranscript for gene in iterator: ninput += 1 transcript = _select(gene) noutput += 1 for g in transcript: nfeatures += 1 options.stdout.write("%s\n" % g) elif options.filter in ("gene", "transcript"): if options.filename_filter: ids, nerrors = IOTools.ReadList( open(options.filename_filter, "r")) E.info("read %i ids" % len(ids)) ids = set(ids) by_gene = options.filter == "gene" by_transcript = options.filter == "transcript" invert = options.invert_filter reset_strand = options.reset_strand for gff in GTF.iterator(options.stdin): ninput += 1 keep = False if by_gene: keep = gff.gene_id in ids if by_transcript: keep = gff.transcript_id in ids if (invert and keep) or (not invert and not keep): continue if reset_strand: gff.strand = "." options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.sample_size: if options.filter == "gene": iterator = GTF.flat_gene_iterator( GTF.iterator(options.stdin)) elif options.filter == "transcript": iterator = GTF.transcript_iterator( GTF.iterator(options.stdin)) if options.min_exons_length: iterator = GTF.iterator_min_feature_length( iterator, min_length=options.min_exons_length, feature="exon") data = [x for x in iterator] ninput = len(data) if len(data) > options.sample_size: data = random.sample(data, options.sample_size) for d in data: noutput += 1 for dd in d: nfeatures += 1 options.stdout.write(str(dd) + "\n") else: assert False, "please supply either a filename " "with ids to filter with (--apply) or a sample-size." 
elif options.exons2introns: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") input_ranges = Intervals.combine(cds_ranges + exon_ranges) if len(input_ranges) > 1: last = input_ranges[0][1] output_ranges = [] for start, end in input_ranges[1:]: output_ranges.append((last, start)) last = end if options.intron_border: b = options.intron_border output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges] if options.intron_min_length: l = options.intron_min_length output_ranges = [ x for x in output_ranges if x[1] - x[0] > l] for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "intron" entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 else: ndiscarded += 1 elif options.set_score2distance: for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(gffs[0].strand) all_start, all_end = min([x.start for x in gffs]), max( [x.end for x in gffs]) if strand != ".": t = 0 if strand == "-": gffs.reverse() for gff in gffs: gff.score = t t += gff.end - gff.start if strand == "-": gffs.reverse() for gff in gffs: options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.remove_overlapping: index = GTF.readAndIndex( GTF.iterator(IOTools.openFile(options.remove_overlapping, "r"))) for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 found = False for e in gffs: if index.contains(e.contig, e.start, e.end): found = True break if found: ndiscarded += 1 else: noutput += 1 for e in gffs: nfeatures += 1 options.stdout.write("%s\n" % str(e)) elif options.intersect_transcripts: for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 r = [] for g in gffs: if options.with_utr: ranges = GTF.asRanges(g, "exon") else: ranges = GTF.asRanges(g, "CDS") r.append(ranges) result = r[0] for x in r[1:]: result = Intervals.intersect(result, x) entry = GTF.Entry() entry.copy(gffs[0][0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "exon" for start, end in result: entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 elif options.rename_duplicates: gene_ids = list() transcript_ids = list() gtfs = list() for gtf in GTF.iterator(options.stdin): gtfs.append(gtf) if gtf.feature == "CDS": gene_ids.append(gtf.gene_id) transcript_ids.append(gtf.transcript_id) dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1] dup_transcript = [item for item in set(transcript_ids) if transcript_ids.count(item) > 1] E.info("Number of duplicated gene_ids: %i" % len(dup_gene)) E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript)) gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene)))) transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript)))) for gtf in gtfs: if gtf.feature == "CDS": if gtf.gene_id in dup_gene: gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1 gtf.setAttribute('gene_id', gtf.gene_id + "." + str(gene_dict[gtf.gene_id])) if gtf.transcript_id in dup_transcript: transcript_dict[gtf.transcript_id] = \ transcript_dict[gtf.transcript_id] + 1 gtf.setAttribute('transcript_id', gtf.transcript_id + "." 
+ str(transcript_dict[gtf.transcript_id])) options.stdout.write("%s\n" % gtf) else: for gffs in GTF.flat_gene_iterator( GTF.iterator(options.stdin), strict=options.strict): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") # sanity checks strands = set([x.strand for x in gffs]) contigs = set([x.contig for x in gffs]) if len(strands) > 1: raise ValueError("can not merge gene '%s' on multiple strands: %s" % ( gffs[0].gene_id, str(strands))) if len(contigs) > 1: raise ValueError("can not merge gene '%s' on multiple contigs: %s" % ( gffs[0].gene_id, str(contigs))) strand = Genomics.convertStrand(gffs[0].strand) if cds_ranges and options.with_utr: cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1] midpoint = (cds_end - cds_start) / 2 + cds_start utr_ranges = [] for start, end in Intervals.truncate(exon_ranges, cds_ranges): if end - start > 3: if strand == ".": feature = "UTR" elif strand == "+": if start < midpoint: feature = "UTR5" else: feature = "UTR3" elif strand == "-": if start < midpoint: feature = "UTR3" else: feature = "UTR5" utr_ranges.append((feature, start, end)) output_feature = "CDS" output_ranges = cds_ranges else: output_feature = "exon" output_ranges = exon_ranges utr_ranges = [] result = [] if options.merge_exons: # need to combine per feature - skip # utr_ranges = Intervals.combineAtDistance( # utr_ranges, # options.merge_exons_distance) output_ranges = Intervals.combineAtDistance( output_ranges, options.merge_exons_distance) for feature, start, end in utr_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.feature = feature entry.transcript_id = "merged" entry.start = start entry.end = end result.append(entry) for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = output_feature entry.start = start entry.end = end result.append(entry) elif options.merge_transcripts: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][0] entry.end = output_ranges[-1][1] result.append(entry) elif options.merge_introns: if len(output_ranges) >= 2: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][1] entry.end = output_ranges[-1][0] result.append(entry) else: ndiscarded += 1 continue result.sort(key=lambda x: x.start) for x in result: options.stdout.write("%s\n" % str(x)) nfeatures += 1 noutput += 1 E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded)) E.Stop()
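# The UTR labelling in the default branch above hinges on the CDS
# midpoint: on '+' an exon interval before the midpoint is 5' UTR and
# after it 3' UTR, with the labels swapped on '-'. A minimal
# standalone version of that decision (names illustrative):
def utr_feature(start, midpoint, strand):
    if strand == ".":
        return "UTR"
    before = start < midpoint
    if strand == "+":
        return "UTR5" if before else "UTR3"
    return "UTR3" if before else "UTR5"

assert utr_feature(100, 500, "+") == "UTR5"
assert utr_feature(900, 500, "+") == "UTR3"
assert utr_feature(100, 500, "-") == "UTR3"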
def loadSequence(self, sequence):
    """load sequence properties from a sequence."""

    SequenceProperties.loadSequence(self, sequence)

    # uppercase all letters
    sequence = sequence.upper()

    self.mNStopCodons = 0

    # setup counting arrays
    # nucleotide counts for each position (this is not a sum of the
    # counts per degenerate site, as the codon might be unintelligible,
    # e.g. GNN)
    self.mCounts = [{'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0},
                    {'A': 0, 'C': 0, 'G': 0, 'T': 0, 'X': 0, 'N': 0}]

    # nucleotide counts for each position per degeneracy
    self.mCountsDegeneracy = []

    for x in (0, 1, 2):
        xx = []
        for y in range(5):
            yy = {}
            for z in Bio.Alphabet.IUPAC.extended_dna.letters:
                yy[z] = 0
            xx.append(yy)
        self.mCountsDegeneracy.append(xx)

    for codon in [sequence[x:x + 3] for x in range(0, len(sequence), 3)]:

        for x in (0, 1, 2):
            self.mCounts[x][codon[x]] += 1

        if Genomics.IsStopCodon(codon):
            self.mNStopCodons += 1
            continue

        try:
            aa, deg1, deg2, deg3 = Genomics.GetDegeneracy(codon)
            degrees = (deg1, deg2, deg3)
            for x in range(len(degrees)):
                self.mCountsDegeneracy[x][degrees[x]][codon[x]] += 1
        except KeyError:
            pass
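# A self-contained sketch of position degeneracy, the quantity that
# mCountsDegeneracy tallies above: a codon position is n-fold
# degenerate if n of the four possible bases (including the original)
# conserve the amino acid. A two-amino-acid toy table stands in for
# the full genetic code used by Genomics.GetDegeneracy.
toy_code = {"GCA": "A", "GCC": "A", "GCG": "A", "GCT": "A", "ATG": "M"}

def degeneracy(codon, pos, code=toy_code):
    aa = code[codon]
    return sum(1 for na in "ACGT"
               if code.get(codon[:pos] + na + codon[pos + 1:]) == aa)

assert degeneracy("GCC", 2) == 4   # third position of Ala: fourfold
assert degeneracy("ATG", 0) == 1   # Met has a single codon: non-degenerate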
def main(argv=None):

    parser = E.OptionParser(
        version="%prog version: $Id: analyze_codonbias_shannon.py 2864 2010-03-03 10:18:16Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-c", "--is-cds", dest="is_cds", action="store_true",
                      help="input are cds (nucleotide) sequences [%default]")

    parser.set_defaults(
        is_cds=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    options.stdout.write(
        "snpid\tidentifier\tpos\treference\tvariant\tcounts\tweight\n")

    alphabet = "ACDEFGHIKLMNPQRSTVWY"

    snpid = 0

    for entry in FastaIterator.iterate(options.stdin):
        identifier = entry.title

        if options.is_cds:
            cds_sequence = entry.sequence.upper()
            assert len(cds_sequence) % 3 == 0, \
                "length of sequence '%s' is not a multiple of 3" % entry.title

            sequence = Genomics.translate(cds_sequence)
            weights = []
            for pos, cds_pos in enumerate(range(0, len(cds_sequence), 3)):
                codon = cds_sequence[cds_pos:cds_pos + 3]
                counts = collections.defaultdict(int)
                for x in range(0, 3):
                    rna = codon[x]
                    for na in "ACGT":
                        if na == rna:
                            continue
                        taa = Genomics.translate(
                            codon[:x] + na + codon[x + 1:])
                        counts[taa] += 1
                weights.append(counts)

        else:
            sequence = entry.sequence.upper()
            counts = {}
            for x in alphabet:
                counts[x] = 1
            weights = [counts] * len(sequence)

        for pos, ref in enumerate(sequence):

            if ref not in alphabet:
                continue
            w = weights[pos]
            t = float(sum(w.values()))
            for variant in alphabet:
                if variant == ref:
                    continue
                snpid += 1
                options.stdout.write(
                    "%s\n" % "\t".join(
                        ("%010i" % snpid,
                         identifier,
                         str(pos + 1),
                         ref,
                         variant,
                         "%i" % w[variant],
                         "%6.4f" % (w[variant] / t),
                         )))

    E.Stop()
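# What the per-codon weights computed above represent: for each codon,
# count how many of the nine single-base substitutions yield each
# amino acid. A sketch with the translate function left abstract; the
# helper name is illustrative only.
import collections

def single_mutation_counts(codon, translate):
    counts = collections.defaultdict(int)
    for x in range(3):
        for na in "ACGT":
            if na == codon[x]:
                continue
            counts[translate(codon[:x] + na + codon[x + 1:])] += 1
    return counts

# with the real Genomics.translate, e.g.
#   single_mutation_counts("TGG", Genomics.translate)
# the counts always total 9 (3 positions x 3 alternative bases).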
def AlignCodonBased(seq_wobble, seq_cds, seq_peptide, map_p2c, options,
                    diag_width=2, max_advance=2):
    """advance in codons in seq_wobble and match to nucleotides in seq_cds.

    Due to alignlib this is all in one-based coordinates.

    Takes care of frameshifts.
    """

    map_p2c.clear()

    gop, gep = -1.0, -1.0
    matrix = alignlib_lite.py_makeSubstitutionMatrixBackTranslation(
        1, -10, 1, alignlib_lite.py_getDefaultEncoder())

    pep_seq = seq_peptide.asString()
    cds_seq = seq_cds.asString()
    wobble_seq = seq_wobble.asString()

    lcds = seq_cds.getLength()
    lwobble = seq_wobble.getLength()
    y = 0
    x = 0

    last_start = None

    while x < lwobble and y < lcds:

        xr = seq_wobble.asResidue(x)

        # skip over masked chars in wobble - these are gaps
        if seq_wobble.asChar(x) == "X":
            x += 1
            continue

        # skip over masked chars in wobble - these are from
        # masked chars in the peptide sequence
        # Note to self: do not see all implications of this change -
        # check later.
        if seq_wobble.asChar(x) == "N":
            x += 1
            continue

        # skip over gaps in wobble
        if seq_wobble.asChar(x) == "-":
            x += 1
            continue

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if options.loglevel >= 6:
            if (x % 3 == 0):
                c = seq_cds.asChar(y) + \
                    seq_cds.asChar(y + 1) + \
                    seq_cds.asChar(y + 2)
                options.stdlog.write(
                    "# c=%s, x=%i, y=%i, aa=%s target=%s\n" %
                    (c, x, y, Genomics.MapCodon2AA(c), pep_seq[int(x / 3)]))

            options.stdlog.write(
                "# x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%s\n" %
                (x, seq_wobble.asChar(x), y, seq_cds.asChar(y), xr,
                 seq_cds.asResidue(y), str(s)))

        # deal with mismatches
        if s <= 0:

            tmp_map_p2c = alignlib_lite.py_makeAlignmentVector()

            # backtrack to previous codon boundary and re-align a window
            # of three codons, so that double frameshifts that span two
            # codons (producing two X's and six W's, WWWWWW) are handled.

            # number of nucleotides to extend (should be multiple of 3);
            # less than 12 caused failure for some peptides.
            d = 15

            # extend by amount dx
            dx = (x % 3) + d

            x_start = max(0, x - dx)
            # map to ensure that no ambiguous residue mappings
            # exist after re-alignment
            y_start = max(0,
                          map_p2c.mapRowToCol(x_start,
                                              alignlib_lite.py_RIGHT))

            if (x_start, y_start) == last_start:
                raise ValueError("infinite loop detected")

            last_start = (x_start, y_start)

            x_end = min(x_start + 2 * d, len(wobble_seq))
            y_end = min(y_start + 2 * d, len(cds_seq))

            wobble_fragment = alignlib_lite.py_makeSequence(
                wobble_seq[x_start:x_end])
            cds_fragment = alignlib_lite.py_makeSequence(
                cds_seq[y_start:y_end])

            AlignExhaustive(wobble_fragment, cds_fragment,
                            "", tmp_map_p2c, options)

            if options.loglevel >= 10:
                options.stdlog.write(
                    "# fragmented alignment from %i-%i, %i-%i:\n%s\n" %
                    (x_start, x_end, y_start, y_end,
                     str(alignlib_lite.py_AlignmentFormatExplicit(
                         tmp_map_p2c, wobble_fragment, cds_fragment))))
                options.stdlog.flush()

            # clear alignment
            map_p2c.removeRowRegion(x_start, x_end)

            ngap = 0
            last_x, last_y = None, None
            for xxx in range(tmp_map_p2c.getRowFrom(),
                             tmp_map_p2c.getRowTo()):
                yyy = tmp_map_p2c.mapRowToCol(xxx)
                if yyy >= 0:
                    x = xxx + x_start
                    y = yyy + y_start
                    xr = seq_wobble.asResidue(x)
                    s = matrix.getValue(seq_wobble.asResidue(x),
                                        seq_cds.asResidue(y))
                    if s < 0:
                        raise ValueError(
                            "mismatched residue wobble: %i (%s), "
                            "cds: %i (%s)" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y)))

                    map_p2c.addPair(x, y, s)
                    last_x, last_y = x, y
                    if options.loglevel >= 6:
                        options.stdlog.write(
                            "# reset: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                            (x, seq_wobble.asChar(x), y, seq_cds.asChar(y),
                             xr, seq_cds.asResidue(y), s))
                        options.stdlog.flush()
                    ngap = 0
                else:
                    ngap += 1

                    # treat special case of double frameshifts. They might
                    # cause a peptide/wobble residue to be eliminated and
                    # thus the translated sequences will differ.
                    # simply delete the last residue between x and y and
                    # move to next codon.
                    if ngap == 3:
                        map_p2c.removeRowRegion(last_x, last_x + 1)
                        last_x += 1
                        map_p2c.addPair(last_x, last_y)
                        if options.loglevel >= 6:
                            options.stdlog.write(
                                "# double: x=%i\twob=%s\ty=%i\tcds=%s\txr=%s\tcds=%i\tscore=%i\n" %
                                (last_x, seq_wobble.asChar(last_x), last_y,
                                 seq_cds.asChar(last_y), xr,
                                 seq_cds.asResidue(last_y), s))
                            options.stdlog.flush()
                        ngap = 0

            # exit condition if alignment is shorter than problematic
            # residue - need to catch this to avoid an infinite loop.
            if tmp_map_p2c.getRowTo() < d:
                if lwobble - x <= 4:
                    # only last codon is missing, so ok
                    break
                else:
                    raise ValueError("failure to align in designated window.")

            s = 0

        s = matrix.getValue(xr, seq_cds.asResidue(y))

        if s < 0:
            raise ValueError("mis-matching residues.")

        map_p2c.addPair(x, y, float(s))

        # advance to next residues
        x += 1
        y += 1

    # sanity checks
    assert map_p2c.getRowTo() <= seq_wobble.getLength()
    assert map_p2c.getColTo() <= seq_cds.getLength()
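
# A standalone sketch of the re-alignment window arithmetic used above:
# on a mismatch at wobble position x, the code backtracks to the previous
# codon boundary and re-aligns a window of roughly 2*d nucleotides. The
# numbers below are illustrative only.
d = 15                    # nucleotides to extend; kept a multiple of 3
x = 47                    # assumed position of the mismatch
lwobble = 300             # assumed wobble sequence length

dx = (x % 3) + d          # back up to the codon boundary, then d more
x_start = max(0, x - dx)  # window start on the wobble sequence
x_end = min(x_start + 2 * d, lwobble)
# a 30-nt window starting on a codon boundary
print("window: %i-%i" % (x_start, x_end))  # window: 30-60
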
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version = "%prog version: $Id: run_nubiscan.py 2861 2010-02-23 17:36:32Z andreas $", usage = globals()["__doc__"] ) parser.add_option("-i", "--iterations", dest="iterations", type="int", help="number of iterations for sampling [default=%default]." ) parser.add_option("-q", "--qvalue", dest="qvalue_threshold", type="float", help="qvalue threshold [default=%default]." ) parser.add_option("--without-combine", dest="combine", action = "store_false", help="combine overlapping motifs [default=%default]." ) parser.add_option("-f", "--fdr-control", dest="fdr_control", type="choice", choices = ("per-sequence", "all", "xall"), help="qvalue threshold [default=%default]." ) parser.add_option("-m", "--motif", dest="motif", type="choice", choices=("rxrvdr", "rxrvdr1", "rxrvdr2", "nr"), help="qvalue threshold [default=%default]." ) parser.add_option("-a", "--arrangements", dest="arrangements", type="string", help ="',' separated list of repeat arrangements [default=%default]") parser.add_option("-x", "--mask", dest="mask", type="choice", choices=("dust","repeatmasker"), help ="mask sequences before scanning [default=%default]") parser.add_option("--output-stats", dest="output_stats", action = "store_true", help="output stats [default=%default]." ) parser.add_option("--add-sequence", dest="add_sequence", action = "store_true", help="add sequence information [default=%default]." ) parser.set_defaults( iterations = 100, qvalue_threshold = 0.05, motif = "rxrvdr", fdr_control = "all", combine = True, arrangements = None, mask = None, output_stats = False, add_sequence = False, ) ## add common options (-h/--help, ...) 
and parse command line (options, args) = E.Start( parser, argv = argv, add_output_options = True ) ## do sth ninput, nskipped, noutput = 0, 0, 0 if options.arrangements == None: options.arrangements = [ "DR%s" % x for x in range(0,15) ] + [ "ER%s" % x for x in range(0,15) ] else: options.arrangements = options.arrangements.split(",") options.stdout.write( "%s" % "\t".join(Nubiscan.NubiscanMatch._fields) ) if options.add_sequence: options.stdout.write( "\tsequence" ) options.stdout.write("\n") if options.motif == 'nr': sense_matrix = NR elif options.motif == "rxrvdr": sense_matrix = RXRVDR elif options.motif == "rxrvdr1": sense_matrix = RXRVDR1 elif options.motif == "rxrvdr2": sense_matrix = RXRVDR2 else: raise ValueError("unknown matrix %s" % options.motif) if options.fdr_control == "all": seqs = list(FastaIterator.iterate(options.stdin)) if options.mask: masked_seqs = maskSequences( [x.sequence for x in seqs], options.mask ) else: masked_seqs = [x.sequence for x in seqs] ninput = len(seqs) map_id2title = dict( enumerate( [re.sub("\s.*", "", x.title) for x in seqs] ) ) matcher = Nubiscan.MatcherRandomisationSequences( sense_matrix, samples = options.iterations ) results = matcher.run( masked_seqs, options.arrangements, qvalue_threshold = options.qvalue_threshold ) if options.combine: results = Nubiscan.combineMotifs( results ) for r in results: if r.alternatives: alternatives = ",".join( [x.arrangement for x in r.alternatives ] ) else: alternatives = "" options.stdout.write( "\t".join( ( map_id2title[r.id], "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue, "%6.4e" % r.qvalue, alternatives) ) ) if options.add_sequence: s = masked_seqs[int(r.id)][r.start:r.end] if r.strand == "-": s = Genomics.complement( s ) s = s[:6].upper() + s[6:-6].lower() + s[-6:].upper() options.stdout.write( "\t%s" % s ) options.stdout.write("\n") noutput += 1 # output stats if options.output_stats: outfile = E.openOutputFile( "fdr" ) outfile.write("bin\thist\tnobserved\n" ) for bin, hist, nobs in zip(matcher.bin_edges, matcher.hist, matcher.nobservations): outfile.write( "%f\t%f\t%f\n" % (bin, hist, nobs)) outfile.close() elif options.fdr_control == "xall": matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix, samples = options.iterations ) # collect all results matches = [] for seq in FastaIterator.iterate(options.stdin): ninput += 1 mm = matcher.run( seq.sequence, options.arrangements, qvalue_threshold = None ) for m in mm: matches.append( m._replace( sequence = seq.title ) ) # estimate qvalues for all matches across all sequences pvalues = [ x.pvalue for x in matches ] fdr = Stats.doFDR( pvalues ) qvalues = fdr.mQValues results = [] for m, qvalue in zip(matches, qvalues): if qvalue > options.qvalue_threshold: continue results.append( m._replace( qvalue = qvalue ) ) if options.combine: results = Nubiscan.combineMotifs( results ) # output for r in results: options.stdout.write( "\t".join( ( r.id, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%6.4e" % r.pvalue, "%6.4e" % r.qvalue ) ) + "\n" ) noutput += 1 elif options.fdr_control == "per-sequence": matcher = Nubiscan.MatcherRandomisationSequence( sense_matrix, samples = options.iterations ) for seq in FastaIterator.iterate(options.stdin): ninput += 1 result = matcher.run( seq.sequence, options.arrangements, qvalue_threshold = options.qvalue_threshold ) if options.combine: result = Nubiscan.combineMotifs( result ) t = re.sub(" .*","", seq.title) 
for r in result: options.stdout.write( "\t".join( ( t, "%i" % r.start, "%i" % r.end, r.strand, r.arrangement, "%6.4f" % r.score, "%6.4f" % r.zscore, "%f" % r.pvalue, "%f" % r.qvalue ) ) + "\n" ) noutput += 1 E.info( "ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput,nskipped) ) ## write footer and output benchmark information. E.Stop()
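
# A standalone sketch of the "xall" FDR strategy above: pool p-values
# across all sequences, convert them to q-values, then filter on the
# q-value threshold. Stats.doFDR is project code, so a plain
# Benjamini-Hochberg step-up stands in for it here.
def bh_qvalues(pvalues):
    n = len(pvalues)
    order = sorted(range(n), key=lambda i: pvalues[i])
    qvalues = [0.0] * n
    prev = 1.0
    # walk from the largest p-value down, enforcing monotonicity
    for rank in range(n, 0, -1):
        i = order[rank - 1]
        prev = min(prev, pvalues[i] * n / rank)
        qvalues[i] = prev
    return qvalues


pvalues = [0.001, 0.008, 0.039, 0.041, 0.20]
threshold = 0.05
keep = [i for i, q in enumerate(bh_qvalues(pvalues)) if q <= threshold]
print(keep)  # [0, 1]: indices of matches surviving the threshold
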
def updateVariants(variants, lcontig, strand, phased=True):
    '''update variants such that they use the same coordinate
    system (and strand) as the transcript.

    fixes 1-ness of variants
    '''
    new_variants = []
    is_positive = Genomics.IsPositiveStrand(strand)

    for variant in variants:

        pos = variant.pos
        genotype = bytes(variant.genotype)
        reference = bytes(variant.reference)

        # fix 1-ness of variants
        # pos -= 1

        if len(genotype) == 1:
            variantseqs = list(Genomics.decodeGenotype(genotype))
            has_wildtype = reference in variantseqs
            action = "="
            start, end = pos, pos + 1
        else:

            variantseqs = [x[1:] for x in genotype.split("/")]
            lvariant = max([len(x) for x in variantseqs])
            if not phased:
                variantseqs = [x for x in variantseqs if x]
            has_wildtype = "*" in genotype

            if "+" in genotype and "-" in genotype:

                # both insertion and deletion at position
                # the range is given by the deletion
                # see below for explanations
                if genotype.startswith("+"):
                    action = ">"
                    variantseqs[1] += "-" * (lvariant - len(variantseqs[1]))
                else:
                    action = "<"
                    variantseqs[0] += "-" * (lvariant - len(variantseqs[0]))

                start, end = pos + 1, pos + lvariant + 1

            elif "-" in genotype:
                action = "-"
                # samtools: deletions are after the base denoted by
                # snp.position
                #   * <- deletion at 1
                # 0 1 2 3 4 5 6
                #     - -
                # 6 5 4 3 2 1 0
                # deletion of 2+3 = (2,4)
                # on reverse: (7-4, 7-2) = (3,5)
                start, end = pos + 1, pos + lvariant + 1

                # deletions of unequal length are filled up with "-"
                # This is necessary to deal with negative strands:
                # -at/-atg on the positive strand deletes a t [g]
                # -at/-atg on the negative strand deletes [g] t a
                variantseqs = [x + "-" * (lvariant - len(x))
                               for x in variantseqs]

            elif "+" in genotype:
                action = "+"
                # indels are after the base denoted by position
                # as region use both flanking bases so that negative strand
                # coordinates work
                # insertion between position 2 and 3
                #     * <- insertion at pos 2
                # 0 1 2i3 4
                # 4 3 2i1 0
                # is insertion between 1 and 2 in reverse
                # including both flanking residues makes it work:
                # (2,3) = (5-3,5-2) = (2,3)
                # but:
                # (2,4) = (5-4,5-2) = (1,3)
                start, end = pos, pos + 2

        # revert strand
        if not is_positive:
            reference = Genomics.complement(reference)
            variantseqs = [Genomics.complement(x.upper())
                           for x in variantseqs]
            start, end = lcontig - end, lcontig - start

        new_variants.append(ExtendedVariant._make((
            start, end, reference.upper(), action, has_wildtype,
            variantseqs)))

    return new_variants
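
# A standalone sketch of the strand flip at the end of updateVariants:
# a half-open interval (start, end) on the forward strand of a contig of
# length lcontig becomes (lcontig - end, lcontig - start) on the reverse
# strand. This reproduces the deletion example from the comments above.
lcontig = 7
start, end = 2, 4  # deletion of positions 2 and 3 on the forward strand
rev = (lcontig - end, lcontig - start)
print(rev)         # (3, 5), matching the "on reverse" comment
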
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: snp2maf.py 2875 2010-03-27 17:42:04Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tracks", dest="tracks", type="string",
                      action="append",
                      help="tracks (tablenames) to use in sqlite database "
                      "[default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-r", "--reference", dest="reference", type="string",
                      help="name of reference [default=%default].")
    parser.add_option("-i", "--is-gtf", dest="is_gtf", action="store_true",
                      help="if set, the gene_id will be added to the "
                      "alignment header [default=%default].")
    parser.add_option("-z", "--compress", dest="compress",
                      action="store_true",
                      help="compress output with gzip [default=%default].")
    parser.add_option("-p", "--pattern-identifier", dest="pattern_track",
                      type="string",
                      help="regular expression pattern for track "
                      "[default=%default].")

    parser.set_defaults(
        genome_file=None,
        tracks=[],
        database="csvdb",
        output=[],
        border=0,
        reference="reference",
        pattern_track="(\S+)",
        is_gtf=True,
        compress=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if not options.database or not options.tracks:
        raise ValueError("please supply both database and tracks")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    # both GTF and GFF input are parsed with the same iterator; the
    # --is-gtf flag only changes the alignment header written below
    infile_gff = GTF.iterator(options.stdin)

    dbhandle = sqlite3.connect(options.database)

    statement = '''SELECT pos, reference, genotype
                   FROM %(track)s
                   WHERE contig = '%(contig)s' AND
                   pos BETWEEN %(extended_start)s and %(extended_end)s
                '''

    counts = E.Counter()

    tracks = options.tracks
    try:
        translated_tracks = [
            re.search(options.pattern_track, track).groups()[0]
            for track in tracks]
    except AttributeError:
        raise AttributeError(
            "pattern `%s` does not match input tracks." %
            options.pattern_track)

    if options.compress:
        outfile = gzip.GzipFile(fileobj=options.stdout)
    else:
        outfile = options.stdout

    outfile.flush()
    outfile.write("##maf version=1 program=snp2maf.py\n\n")

    for gff in infile_gff:
        counts.input += 1

        contig = gff.contig
        strand = gff.strand
        lcontig = fasta.getLength(contig)
        region_start, region_end = gff.start, gff.end
        if contig.startswith("chr"):
            contig = contig[3:]
        extended_start = region_start - options.border
        extended_end = region_end + options.border
        is_positive = Genomics.IsPositiveStrand(strand)

        E.info("processing %s" % str(gff))

        # collect all variants
        all_variants = []
        for track in options.tracks:
            cc = dbhandle.cursor()
            cc.execute(statement % locals())
            all_variants.append(map(Variants.Variant._make, cc.fetchall()))
            cc.close()

        E.debug("%s:%i..%i collected %i variants for %i tracks" %
                (contig, region_start, region_end,
                 sum([len(x) for x in all_variants]),
                 len(all_variants)))

        reference_seq = fasta.getSequence(
            contig, "+", region_start, region_end)
        lseq = len(reference_seq)
        alleles = collections.defaultdict(list)

        # build allele sequences for track and count maximum chars per
        # mali column
        colcounts = numpy.ones(lseq, dtype=int)
        for track, variants in zip(translated_tracks, all_variants):
            variants = Variants.updateVariants(variants, lcontig, "+")
            a = Variants.buildAlleles(reference_seq,
                                      variants,
                                      reference_start=region_start)

            alleles[track] = a
            for allele in a:
                for pos, c in enumerate(allele):
                    colcounts[pos] = max(colcounts[pos], len(c))

        # realign gapped regions
        alignIndels(alleles, colcounts)

        if options.is_gtf:
            outfile.write("a gene_id=%s\n" % gff.gene_id)
        else:
            outfile.write("a\n")

        maf_format = "s %(name)-30s %(pos)9i %(size)6i %(strand)s %(lcontig)9i %(seq)s\n"

        def __addGaps(sequence, colcounts):
            '''output gapped sequence.'''
            r = []
            for x, c in enumerate(sequence):
                r.append(c + "-" * (colcounts[x] - len(c)))
            return "".join(r)

        name = ".".join((options.reference, contig))
        if is_positive:
            pos = region_start
        else:
            pos = lcontig - region_start
        size = lseq
        seq = __addGaps(reference_seq, colcounts)
        outfile.write(maf_format % (locals()))

        for track in translated_tracks:
            for aid, allele in enumerate(alleles[track]):
                seq = __addGaps(allele, colcounts)
                if not is_positive:
                    seq = Genomics.complement(seq)
                size = len(seq) - seq.count("-")
                name = ".".join((track + "-%i" % aid, contig))
                outfile.write(maf_format % (locals()))

        outfile.write("\n")

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.Stop()
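
# A standalone sketch of the column padding used when writing the MAF
# block above: colcounts holds, per reference column, the longest allele
# string seen in any track; shorter entries are padded with "-" so all
# rows of the multiple alignment stay the same width. add_gaps mirrors
# the __addGaps helper in the function above.
def add_gaps(sequence, colcounts):
    # pad each per-column string to the column's maximum width
    return "".join(c + "-" * (colcounts[x] - len(c))
                   for x, c in enumerate(sequence))


reference = ["A", "C", "G", "T"]
allele = ["A", "CTT", "G", "-"]  # insertion after the C column
colcounts = [max(len(r), len(a)) for r, a in zip(reference, allele)]
print(add_gaps(reference, colcounts))  # AC--GT
print(add_gaps(allele, colcounts))     # ACTTG-
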
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id: gff2gff.py$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=(
                          "add-flank",
                          "add-upstream-flank",
                          "add-downstream-flank",
                          "crop",
                          "crop-unique",
                          "complement-groups",
                          "combine-groups",
                          "filter-range",
                          "join-features",
                          "merge-features",
                          "sanitize",
                          "to-forward-coordinates",
                          "to-forward-strand"),
                      help="method to apply [%default]")

    parser.add_option(
        "--ignore-strand", dest="ignore_strand",
        help="ignore strand information.", action="store_true")

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input will be treated as gtf [default=%default].")

    parser.add_option(
        "-c", "--contigs-tsv-file", dest="input_filename_contigs",
        type="string",
        help="filename with contig lengths.")

    parser.add_option(
        "--agp-file", dest="input_filename_agp", type="string",
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome.")

    parser.add_option(
        "--crop-gff-file", dest="filename_crop_gff", type="string",
        help="GFF/GTF file to crop against.")

    parser.add_option(
        "--group-field", dest="group_field", type="string",
        help="gff field/attribute to group by such as gene_id, "
        "transcript_id, ... [%default].")

    parser.add_option(
        "--filter-range", dest="filter_range", type="string",
        help="extract all elements overlapping a range. A range is "
        "specified by either 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to'.")

    parser.add_option(
        "--sanitize-method", dest="sanitize_method", type="choice",
        choices=("ucsc", "ensembl", "genome"),
        help="method to use for sanitizing chromosome names. "
        "[%default].")

    parser.add_option(
        "--flank-method", dest="flank_method", type="choice",
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        "[%default].")

    parser.add_option(
        "--skip-missing", dest="skip_missing", action="store_true",
        help="skip entries on missing contigs. Otherwise an "
        "exception is raised [%default].")

    parser.add_option(
        "--contig-pattern", dest="contig_pattern", type="string",
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize [%default].")

    parser.add_option(
        "--extension-upstream", dest="extension_upstream", type="float",
        help="extension for upstream end [%default].")

    parser.add_option(
        "--extension-downstream", dest="extension_downstream", type="float",
        help="extension for downstream end [%default].")

    parser.add_option(
        "--min-distance", dest="min_distance", type="int",
        help="minimum distance of features to merge/join [%default].")

    parser.add_option(
        "--max-distance", dest="max_distance", type="int",
        help="maximum distance of features to merge/join [%default].")

    parser.add_option(
        "--min-features", dest="min_features", type="int",
        help="minimum number of features to merge/join [%default].")

    parser.add_option(
        "--max-features", dest="max_features", type="int",
        help="maximum number of features to merge/join [%default].")

    parser.set_defaults(
        input_filename_contigs=False,
        filename_crop_gff=None,
        input_filename_agp=False,
        genome_file=None,
        add_up_flank=None,
        add_down_flank=None,
        complement_groups=False,
        crop=None,
        crop_unique=False,
        ignore_strand=False,
        filter_range=None,
        min_distance=0,
        max_distance=0,
        min_features=1,
        max_features=0,
        extension_upstream=1000,
        extension_downstream=1000,
        sanitize_method="ucsc",
        flank_method="add",
        output_format="%06i",
        skip_missing=False,
        is_gtf=False,
        group_field=None,
        contig_pattern=None,
    )

    (options, args) = E.Start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    if options.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            IOTools.openFile(options.input_filename_contigs, "r"))

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = genome_fasta.getContigSizes()

    if options.method in ("to-forward-coordinates",
                          "to-forward-strand",
                          "add-flank",
                          "add-upstream-flank",
                          "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if options.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(IOTools.openFile(options.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(options.stdin)

    if options.method in ("add-upstream-flank",
                          "add-downstream-flank",
                          "add-flank"):

        add_upstream_flank = "add-upstream-flank" == options.method
        add_downstream_flank = "add-downstream-flank" == options.method
        if options.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(options.extension_upstream)
        downstream_flank = int(options.extension_downstream)
        extend_flank = options.flank_method == "extend"

        if options.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, options.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(
                            0, chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(
                            lcontig, chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(
                            0, chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                options.stdout.write(str(gff) + "\n")

    elif options.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            if options.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                options.stdout.write(str(x) + "\n")
                x.start = c.end

    elif options.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs,
                                       group_field=options.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            options.stdout.write(str(x) + "\n")

    elif options.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=False,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=options.min_distance,
                              max_distance=options.max_distance,
                              min_features=options.min_features,
                              max_features=options.max_features,
                              merge=True,
                              output_format=options.output_format):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop":
        for gff in cropGFF(gffs, options.filename_crop_gff):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                "(\S+):(\S+):(\d+)(\.\.|-)(\d+)",
                options.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match(
                    "(\S+):(\d+)(\.\.|-)(\d+)",
                    options.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, sep, end = re.match(
                    "(\d+)(\.\.|\,|\-)(\d+)",
                    options.filter_range).groups()
            except AttributeError:
                raise ValueError(
                    "can not parse range %s" % options.filter_range)
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs, contig=contig,
                                         strand=strand,
                                         interval=interval):
            options.stdout.write(str(gff) + "\n")

    elif options.method == "sanitize":

        def toUCSC(id):
            if not id.startswith("contig") and not id.startswith("chr"):
                id = "chr%s" % id
            return id

        def toEnsembl(id):
            if id.startswith("contig"):
                return id[len("contig"):]
            if id.startswith("chr"):
                return id[len("chr"):]
            return id

        if options.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError(
                    "please specify --genome-file= when using "
                    "--sanitize-method=genome")
            f = genome_fasta.getToken
        elif options.sanitize_method == "ucsc":
            f = toUCSC
        elif options.sanitize_method == "ensembl":
            f = toEnsembl

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if options.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if options.contig_pattern:
                to_remove = [re.compile(x)
                             for x in options.contig_pattern.split(",")]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            options.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())),
                    str(skipped_contigs)))

        if outofrange_contigs:
            E.warn("skipped %i entries on %i contigs because they are out "
                   "of range: %s" %
                   (sum(outofrange_contigs.values()),
                    len(list(outofrange_contigs.keys())),
                    str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())),
                    str(filtered_contigs)))

    else:

        for gff in gffs:

            if options.method == "to-forward-coordinates":
                gff.invert(contigs[gff.contig])

            if options.method == "to-forward-strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            options.stdout.write(str(gff) + "\n")

    E.Stop()
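
# A standalone sketch of the --filter-range parsing above: the three
# accepted layouts are tried in order, from most to least specific.
# parse_range mirrors the regular expressions in the filter-range branch.
import re


def parse_range(text):
    m = re.match(r"(\S+):(\S+):(\d+)(\.\.|-)(\d+)", text)
    if m:
        contig, strand, start, _, end = m.groups()
        return contig, strand, (int(start), int(end))
    m = re.match(r"(\S+):(\d+)(\.\.|-)(\d+)", text)
    if m:
        contig, start, _, end = m.groups()
        return contig, None, (int(start), int(end))
    m = re.match(r"(\d+)(\.\.|,|-)(\d+)", text)
    if m:
        start, _, end = m.groups()
        return None, None, (int(start), int(end))
    raise ValueError("can not parse range %s" % text)


print(parse_range("chr1:+:1000..2000"))  # ('chr1', '+', (1000, 2000))
print(parse_range("chr1:1000-2000"))     # ('chr1', None, (1000, 2000))
print(parse_range("1000,2000"))          # (None, None, (1000, 2000))
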