def readIntervals(infile, options):
    """Read intervals from *infile* into a genomic index.

    Two input formats are supported, selected by ``options.format``:

    ``gtf``
        Entries are grouped by transcript. For each transcript an
        alignment is built with one diagonal per ``exon`` feature and
        stored in an ``IndexedGenome.IndexedGenome`` under the span
        from the smallest start to the largest end of the transcript.
    ``gff``
        Every entry is added as a plain interval to an
        ``IndexedGenome.Simple``.

    Progress is logged every ``options.report_step`` records.

    :param infile: iterable of input lines, handed to ``GTF.iterator``.
    :param options: object providing ``format`` and ``report_step``.
    :returns: the populated index.
    :raises ValueError: if ``options.format`` is neither ``gtf`` nor
        ``gff``. Previously this fell through and failed with a
        ``NameError`` on the unbound ``index``.
    """
    if options.format not in ("gtf", "gff"):
        raise ValueError(
            "unknown interval format %r, expected 'gtf' or 'gff'" %
            (options.format,))

    ninput = 0
    t = time.time()

    def report_progress(nread):
        # shared by both branches: periodic progress message
        if nread % options.report_step == 0:
            elapsed = time.time() - t
            E.info(
                "reading intervals - progress: ninput=%i, time=%i, avg=%f" %
                (nread, elapsed, float(elapsed) / nread))

    if options.format == "gtf":
        index = IndexedGenome.IndexedGenome()

        for gffs in GTF.transcript_iterator(GTF.iterator(infile)):
            ali = alignlib_lite.py_makeAlignmentBlocks()
            for gff in gffs:
                if gff.feature != "exon":
                    continue
                ali.addDiagonal(gff.start, gff.end, 0)

            # NOTE(review): the contig is passed as first argument, as
            # in the gff branch below; the original call omitted it.
            # Confirm against IndexedGenome.IndexedGenome.add().
            index.add(gffs[0].contig,
                      min(x.start for x in gffs),
                      max(x.end for x in gffs),
                      ali)
            ninput += 1
            report_progress(ninput)
    else:
        index = IndexedGenome.Simple()

        for g in GTF.iterator(infile):
            index.add(g.contig, g.start, g.end)
            ninput += 1
            report_progress(ninput)

    E.info("read intervals: %i contigs, %i intervals" % (len(index), ninput))
    return index
def getMapTarget2Query(self):
    """return a map between target to query.

    If the strand is "-", the coordinates for query are on the
    negative strand.

    The map is built by rendering the block lists of this entry
    (``mSbjctBlockStarts``, ``mQueryBlockStarts``, ``mBlockSizes``)
    into a tab-separated blat-format line and copying that into a
    fresh alignment.
    """
    def as_csv(values):
        # comma separated with a trailing comma
        return ",".join([str(v) for v in values]) + ","

    map_target2query = alignlib_lite.py_makeAlignmentBlocks()

    sbjct_starts = self.mSbjctBlockStarts
    query_starts = self.mQueryBlockStarts

    fields = (
        min(sbjct_starts),
        max(sbjct_starts),
        min(query_starts),
        max(query_starts),
        as_csv(sbjct_starts),
        as_csv(query_starts),
        as_csv(self.mBlockSizes),
    )

    f = alignlib_lite.py_AlignmentFormatBlat(
        "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % fields)
    f.copy(map_target2query)

    return map_target2query
def getMapTarget2Query(self):
    """return a map between target to query.

    If the strand is "-", the coordinates for query are on the
    negative strand.

    The block starts and block sizes stored on this entry are written
    as a blat-format line, which is then copied into a new alignment.
    """
    map_target2query = alignlib_lite.py_makeAlignmentBlocks()

    target_starts = self.mSbjctBlockStarts
    query_starts = self.mQueryBlockStarts

    # each list is written comma separated with a trailing comma
    target_column = ",".join(map(str, target_starts)) + ","
    query_column = ",".join(map(str, query_starts)) + ","
    size_column = ",".join(map(str, self.mBlockSizes)) + ","

    line = "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
        min(target_starts), max(target_starts),
        min(query_starts), max(query_starts),
        target_column, query_column, size_column)

    alignlib_lite.py_AlignmentFormatBlat(line).copy(map_target2query)
    return map_target2query
def fromPair(self,
             query_start, query_size, query_strand, query_seq,
             target_start, target_size, target_strand, target_seq):
    '''fill this entry from two aligned (gapped) sequences.

    *query_seq* and *target_seq* must have the same length; ``-``
    marks a gap. Columns in which both sequences carry a residue are
    added to the alignment and counted as match or mismatch. The
    comparison is case-sensitive.

    Sets ``mQueryLength``, ``mSbjctLength``, ``mNMatches``,
    ``mNMismatches`` and ``strand`` and then calls ``fromMap``. If
    *target_strand* is ``-``, ``switchTargetStrand`` is called last.
    '''
    self.mQueryLength = query_size
    self.mSbjctLength = target_size

    map_query2target = alignlib_lite.py_makeAlignmentBlocks()

    assert len(query_seq) == len(target_seq)

    query_pos = query_start
    target_pos = target_start
    nmatches = 0
    nmismatches = 0

    for query_char, target_char in zip(query_seq, target_seq):
        has_query = query_char != "-"
        has_target = target_char != "-"

        if has_query and has_target:
            map_query2target.addPair(query_pos, target_pos)
            if query_char == target_char:
                nmatches += 1
            else:
                nmismatches += 1

        # advance only in the sequence(s) that are not gapped here
        if has_query:
            query_pos += 1
        if has_target:
            target_pos += 1

    self.mNMatches = nmatches
    self.mNMismatches = nmismatches
    self.strand = query_strand

    # the following call will set query_from, query_to for the forward
    # strand though block coordinates might be on the negative strand
    self.fromMap(map_query2target, use_strand=True)

    # swapping the target strand will also swap the query strand
    if target_strand == "-":
        self.switchTargetStrand()
def _buildAllele(allele_id, transcript, exons, introns, offsets,
                 virtual_coordinates=False, reference_exons=None):
    '''build the coding sequence and peptide for one allele.

    The exons of *transcript* are concatenated into a cds. Introns not
    longer than ``frameshiftsize`` are treated as frame-shifting
    introns and merged into the cds if indels make their effective
    size a multiple of three. Longer introns with an unknown splice
    signal that is not all lower-case truncate the allele. The cds is
    translated and checked for premature stop codons.

    ``frameshiftsize``, ``is_seleno`` and ``reference_coordinates`` are
    not parameters; they are read from a surrounding scope that is not
    visible here.

    :param allele_id: identifier, only used in debugging output.
    :param transcript: list of exons providing ``start``, ``end``,
        ``frame`` and ``transcript_id``.
    :param exons: maps ``(start, end)`` of an exon to a sequence of
        strings, one entry per reference position (an empty string is
        a deletion, a longer string an insertion).
    :param introns: same as *exons*, keyed by ``(last_end, start)``.
    :param offsets: list of ``(position, offset)`` tuples.
    :param virtual_coordinates: unused.
    :param reference_exons: unused.
    :returns: tuple of (``Allele``, map between cds and reference).
    '''

    def _getOffset(pos, offsets):
        '''return offset of the last entry at or before pos, else 0.'''
        x = 0
        while x < len(offsets) and offsets[x][0] <= pos:
            x += 1
        x -= 1
        if x >= 0:
            return offsets[x][1]
        else:
            return 0

    def _sumIndels(ss):
        '''sum indels within ss'''
        c = 0
        for s in ss:
            c += len(s) - 1
        return c

    def _getEndOffsets(ss):
        '''get the offset at exons due to deletions at
        start/end of exon.'''
        l = len(ss)
        x = 0
        while x < l and ss[x] == "":
            x += 1
        start_offset = x

        x = l - 1
        while x >= 0 and ss[x] == "":
            x -= 1
        if x >= 0:
            return start_offset, (l - 1) - x
        else:
            return start_offset, 0

    def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                          reference_start):
        '''add cds to reference'''
        c, r = cds_start, reference_start
        for x in cds_seq:
            l = len(x)
            if l == 0:
                r += 1
            else:
                map_cds2reference.addPair(c, r)
                c += l
                r += 1

    def _show(*items):
        '''print items separated by spaces.

        A single string is passed to print so that the output is the
        same whether print is a statement or a function.
        '''
        print(" ".join([str(item) for item in items]))

    def _debugExon(exon, exon_key, exon_seq, genome_pos,
                   start_offset, end_offset):
        '''output the indels and coordinates of an exon.'''
        _show("%i: exon_indels (%i-%i):" % (allele_id, exon.start, exon.end))
        for x, c in enumerate(exons[exon_key]):
            if len(c) != 1:
                _show(x + exon.start, ":%s:" % c)
        _show()
        _show(exons[exon_key])
        _show("genome_pos=", genome_pos,
              ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
              ", len(exon_seq)=", len(exon_seq),
              ", len(exon)=", exon.end - exon.start,
              ", offsets=%i,%i," % (start_offset, end_offset),
              ", offset at start=", _getOffset(exon.start, offsets),
              ", offset at end=", _getOffset(exon.end, offsets))

    def _debugIntron(exon, intron_key, intron_seq, last_end, genome_pos):
        '''output the indels and coordinates of an intron.'''
        _show("%i: intron_indels (%i-%i):" %
              (allele_id, last_end, exon.start))
        if intron_key:
            for x, c in enumerate(introns[intron_key]):
                if len(c) != 1:
                    _show(x + last_end, ":%s:" % c)
            _show()
            _show(introns[intron_key])
            _show("genome_pos=", genome_pos,
                  ",intron=%i-%i" % (genome_pos,
                                     genome_pos + len(intron_seq)),
                  ", len(intron_seq)=", len(intron_seq),
                  ", len(intron)=", exon.start - last_end,
                  ", offset at start=", _getOffset(last_end, offsets),
                  ", offset at end=", _getOffset(exon.start, offsets))
        else:
            _show("empty intron")

    # counts
    is_splice_truncated = False
    is_nmd_knockout = False
    is_stop_truncated = False
    nuncorrected_frameshifts = 0
    ncorrected_frameshifts = 0
    nframeshifts = 0
    nsplice_noncanonical = 0
    reference_first_stop_start = -1
    reference_first_stop_end = -1

    # map between the new cds sequence and the reference
    # sequence
    map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

    ###################################################
    # process first exon
    exon = transcript[0]
    transcript_id = exon.transcript_id

    # collect offset for exon.start
    genome_start = exon.start
    genome_start += _getOffset(genome_start, offsets)
    lcds, cds = 0, []
    cds_starts = [0]

    # still need to deal with deletions of first base:
    exon_starts = [genome_start]
    exon_key = (exon.start, exon.end)
    exon_sequence = exons[exon_key]
    exon_seq = "".join(exon_sequence)

    cds.append(exon_seq)
    _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
    lcds = len(exon_seq)

    if len(exon_seq) != exon.end - exon.start:
        nframeshifts += 1

    # add first exon to genome position
    genome_pos = genome_start + len(exon_seq)
    last_end = exon.end

    # correct for deletions at start/end of exon
    start_offset, end_offset = _getEndOffsets(exon_sequence)

    # length of original transcript
    loriginal = sum([x.end - x.start for x in transcript])

    if E.global_options.loglevel >= 8:
        _debugExon(exon, exon_key, exon_seq, genome_pos,
                   start_offset, end_offset)

    for exon in transcript[1:]:

        last_exon_sequence = exon_sequence

        # get the next intron/exon parameters
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        start_offset, end_offset = _getEndOffsets(exon_sequence)
        intron_key = (last_end, exon.start)

        if last_end == exon.start:
            # catch empty introns
            intron_sequence = []
            intron_key = None
        else:
            intron_sequence = introns[intron_key]

        intron_seq = "".join(intron_sequence)

        ###################################################
        # add preceding intron
        new_exon = True

        if len(intron_seq) > frameshiftsize:

            intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                intron_seq)
            if intron_name == "unknown":
                if intron_seq[:2].islower() and intron_seq[-2:].islower():
                    E.debug("%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
                            (transcript_id, intron_name,
                             intron_seq5, intron_seq3))
                    nsplice_noncanonical += 1
                else:
                    is_splice_truncated = True
                    E.debug("%s: transcript has splice truncated allele: %s: %s:%s" %
                            (transcript_id, intron_name,
                             intron_seq5, intron_seq3))
                    break
            # start a new exon
            cds_starts.append(lcds)

        else:
            # treat as frameshifting intron
            #
            # frame-shifting introns are checked if they are
            # fixed by indels either in the intron itself or
            # the terminal exon sequence. To this end, the effective
            # size of the intron is computed:
            # effective size of intron =
            # indels at terminal x bases at previous exon
            # + size of intron
            # + indels at terminal x bases at next exon
            effective_intron_size = len(intron_seq)

            # Only the last ``frameshiftsize`` positions of the previous
            # exon count. The former slice started at
            # max(0, -frameshiftsize), i.e. always at 0, and thus summed
            # the indels of the whole exon.
            tail_start = max(0, len(last_exon_sequence) - frameshiftsize)
            previous_indels = _sumIndels(last_exon_sequence[tail_start:])
            next_indels = _sumIndels(exon_sequence[:frameshiftsize])
            effective_intron_size += previous_indels + next_indels

            if previous_indels + next_indels == 0 and \
                    len(intron_seq) % 3 == 0:
                has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                     is_seleno=is_seleno)
            else:
                has_stop = False

            if effective_intron_size % 3 == 0 and not has_stop:
                E.debug("%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)" %
                        (transcript_id, last_end, exon.start,
                         effective_intron_size,
                         len(intron_seq),
                         previous_indels, next_indels,))

                # add to previous exon
                cds.append(intron_seq)
                lcds += len(intron_seq)
                ncorrected_frameshifts += 1
                new_exon = False
            else:
                E.debug("%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)" %
                        (transcript_id, last_end, exon.start,
                         effective_intron_size,
                         len(intron_seq),
                         previous_indels, next_indels,
                         has_stop))

                nuncorrected_frameshifts += 1
                # start a new exon
                cds_starts.append(lcds)

        if E.global_options.loglevel >= 8:
            _debugIntron(exon, intron_key, intron_seq, last_end, genome_pos)

        genome_pos += len(intron_seq)

        # assertion - check if genomic coordinate of intron is consistent
        # with offset
        test_offset = _getOffset(exon.start, offsets)
        is_offset = genome_pos - exon.start
        assert is_offset == test_offset, "intron offset difference: %i != %i" % (
            is_offset, test_offset)

        ###################################################
        # add the exon
        exon_seq = "".join(exon_sequence)
        cds.append(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        if new_exon:
            if reference_coordinates:
                exon_starts.append(exon.start + start_offset)
            else:
                exon_starts.append(genome_pos)

        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)

        lcds += len(exon_seq)
        last_end = exon.end

        if E.global_options.loglevel >= 8:
            _debugExon(exon, exon_key, exon_seq, genome_pos,
                       start_offset, end_offset)

        genome_pos += len(exon_seq)

        test_offset = _getOffset(exon.end, offsets)
        is_offset = genome_pos - exon.end
        assert is_offset == test_offset, "exon offset difference: %i != %i" % (
            is_offset, test_offset)

    cds = "".join(cds)
    assert lcds == len(cds)

    # fix incomplete codons at the end of the sequence
    if lcds % 3 != 0:
        offset = lcds % 3
        cds = cds[:-offset]

    # add frame correction for transcripts that do not start at frame=0
    start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

    # n are ignored (? in sequence to deal with genes like Muc2)
    peptide = Genomics.translate("n" * start_frame + cds,
                                 is_seleno=is_seleno,
                                 prefer_lowercase=False,
                                 ignore_n=True)

    # find the first stop codon
    if start_frame != 0:
        # ignore first, potentially incomplete base
        pep_first_stop = peptide.upper().find("X", 1)
    else:
        pep_first_stop = peptide.upper().find("X")

    E.debug("%s: translated peptide = %s, first stop at %i" %
            (transcript_id, peptide, pep_first_stop))

    peptide = peptide.replace("?", "x")

    if E.global_options.loglevel >= 8:
        E.debug("peptide=%s" % peptide)
        E.debug("cds=%s" % cds)

    E.debug("%s: start_frame=%i, first stop at %i/%i" %
            (transcript_id, start_frame, pep_first_stop, len(peptide)))

    lpeptide, lcds = len(peptide), len(cds)

    # check for non-sense mediated decay
    if pep_first_stop != -1:
        cds_first_stop = pep_first_stop * 3 - start_frame
        if cds_first_stop < cds_starts[-1]:
            # stop codon lies before the start of the last exon
            if ncorrected_frameshifts or nuncorrected_frameshifts:
                E.warn("nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected" %
                       (transcript_id,
                        ncorrected_frameshifts,
                        nuncorrected_frameshifts))
            is_nmd_knockout = True
            cds = peptide = ""
            lpeptide, lcds = 0, 0
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        elif pep_first_stop < len(peptide) - 1:
            is_stop_truncated = True
            cds = cds[:cds_first_stop]
            # the slice used to be a bare expression whose result was
            # discarded, leaving the peptide untruncated
            peptide = peptide[:pep_first_stop]
            lpeptide, lcds = len(peptide), len(cds)
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        else:
            E.warn("first stop at %i(cds=%i) ignored: last exon start at %i" %
                   (pep_first_stop,
                    cds_first_stop,
                    cds_starts[-1]))
    else:
        # -1 for no stop codon found
        pep_first_stop = -1
        cds_first_stop = -1
        lpeptide, lcds = len(peptide), len(cds)

    # NOTE(review): peptide is always a string at this point, so this
    # warning can not trigger - confirm the intended condition.
    if peptide is None and nframeshifts == 0:
        E.warn(
            "transcript %s is knockout, though there are no indels - must be nonsense mutation" %
            (transcript_id))

    # build frames
    frames = [start_frame]
    start = start_frame
    l = 0
    for end in cds_starts[1:]:
        l += end - start
        frames.append((3 - l % 3) % 3)
        start = end

    return Allele._make((cds,
                         peptide,
                         len(cds_starts),
                         cds_starts,
                         exon_starts,
                         frames,
                         is_nmd_knockout,
                         is_splice_truncated,
                         is_stop_truncated,
                         nframeshifts,
                         ncorrected_frameshifts,
                         nuncorrected_frameshifts,
                         pep_first_stop,
                         lpeptide,
                         cds_first_stop,
                         lcds,
                         reference_first_stop_start,
                         reference_first_stop_end,
                         loriginal,
                         nsplice_noncanonical,
                         )), map_cds2reference
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads gff/gtf formatted intervals from stdin and writes one psl
    formatted entry per transcript (``--is-gtf``) or per group of
    joined entries to ``options.stdout``. The intervals of a group are
    combined and laid out back to back on the query, starting at 0.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header",
                      action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--input-filename-queries",
                      dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries. Adjacent exons of a "
                      "transcript will still be merged [default=%default].")

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    # pass argv on so that an explicitly supplied argument list is
    # the one being parsed
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(
            GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                  feature="exon"),
            strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(x.start, x.end) for x in gffs])

        for start, end in intervals:
            xend = xstart + end - start
            result.addDiagonal(xstart, xend, start - xstart)
            xstart = xend

        # Identifiers are taken from the last entry of the group. The
        # code used to rely on the loop variable of the list
        # comprehension above, which is only visible after the
        # comprehension in python 2.
        gff = gffs[-1]

        entry = Blat.Match()
        entry.mQueryId = gff.transcript_id
        entry.mSbjctId = gff.contig
        entry.strand = gff.strand

        # Sequence lengths are looked up in the fasta files where
        # possible. If no file was given or the identifier is not part
        # of it, the end of the alignment is used instead.
        if genome_fasta and entry.mSbjctId in genome_fasta:
            entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
        else:
            entry.mSbjctLength = result.getColTo()

        if queries_fasta and entry.mQueryId in queries_fasta:
            entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.Stop()
def _buildAllele(allele_id, transcript, exons, introns, offsets,
                 virtual_coordinates=False, reference_exons=None):
    '''build the coding sequence and peptide for one allele.

    The exons of *transcript* are concatenated into a cds. Introns not
    longer than ``frameshiftsize`` are treated as frame-shifting
    introns and merged into the cds if indels make their effective
    size a multiple of three. Longer introns with an unknown splice
    signal that is not all lower-case truncate the allele. The cds is
    translated and checked for premature stop codons.

    ``frameshiftsize``, ``is_seleno`` and ``reference_coordinates`` are
    not parameters; they are read from a surrounding scope that is not
    visible here.

    :param allele_id: identifier, only used in debugging output.
    :param transcript: list of exons providing ``start``, ``end``,
        ``frame`` and ``transcript_id``.
    :param exons: maps ``(start, end)`` of an exon to a sequence of
        strings, one entry per reference position (an empty string is
        a deletion, a longer string an insertion).
    :param introns: same as *exons*, keyed by ``(last_end, start)``.
    :param offsets: list of ``(position, offset)`` tuples.
    :param virtual_coordinates: unused.
    :param reference_exons: unused.
    :returns: tuple of (``Allele``, map between cds and reference).
    '''

    def _getOffset(pos, offsets):
        '''return offset of the last entry at or before pos, else 0.'''
        x = 0
        while x < len(offsets) and offsets[x][0] <= pos:
            x += 1
        x -= 1
        if x >= 0:
            return offsets[x][1]
        else:
            return 0

    def _sumIndels(ss):
        '''sum indels within ss'''
        c = 0
        for s in ss:
            c += len(s) - 1
        return c

    def _getEndOffsets(ss):
        '''get the offset at exons due to deletions at
        start/end of exon.'''
        l = len(ss)
        x = 0
        while x < l and ss[x] == "":
            x += 1
        start_offset = x

        x = l - 1
        while x >= 0 and ss[x] == "":
            x -= 1
        if x >= 0:
            return start_offset, (l - 1) - x
        else:
            return start_offset, 0

    def _addCds2Reference(map_cds2reference, cds_start, cds_seq,
                          reference_start):
        '''add cds to reference'''
        c, r = cds_start, reference_start
        for x in cds_seq:
            l = len(x)
            if l == 0:
                r += 1
            else:
                map_cds2reference.addPair(c, r)
                c += l
                r += 1

    def _show(*items):
        '''print items separated by spaces.'''
        print(" ".join([str(item) for item in items]))

    def _debugExon(exon, exon_key, exon_seq, genome_pos,
                   start_offset, end_offset):
        '''output the indels and coordinates of an exon.

        The offsets are looked up with ``_getOffset``; the name
        ``getOffset`` used here before is not defined in this function.
        '''
        _show("%i: exon_indels (%i-%i):" % (allele_id, exon.start, exon.end))
        for x, c in enumerate(exons[exon_key]):
            if len(c) != 1:
                _show(x + exon.start, ":%s:" % c)
        _show()
        _show(exons[exon_key])
        _show("genome_pos=", genome_pos,
              ",exon=%i-%i" % (genome_pos, genome_pos + len(exon_seq)),
              ", len(exon_seq)=", len(exon_seq),
              ", len(exon)=", exon.end - exon.start,
              ", offsets=%i,%i," % (start_offset, end_offset),
              ", offset at start=", _getOffset(exon.start, offsets),
              ", offset at end=", _getOffset(exon.end, offsets))

    def _debugIntron(exon, intron_key, intron_seq, last_end, genome_pos):
        '''output the indels and coordinates of an intron.'''
        _show("%i: intron_indels (%i-%i):" %
              (allele_id, last_end, exon.start))
        if intron_key:
            for x, c in enumerate(introns[intron_key]):
                if len(c) != 1:
                    _show(x + last_end, ":%s:" % c)
            _show()
            _show(introns[intron_key])
            _show("genome_pos=", genome_pos,
                  ",intron=%i-%i" % (genome_pos,
                                     genome_pos + len(intron_seq)),
                  ", len(intron_seq)=", len(intron_seq),
                  ", len(intron)=", exon.start - last_end,
                  ", offset at start=", _getOffset(last_end, offsets),
                  ", offset at end=", _getOffset(exon.start, offsets))
        else:
            _show("empty intron")

    # counts
    is_splice_truncated = False
    is_nmd_knockout = False
    is_stop_truncated = False
    nuncorrected_frameshifts = 0
    ncorrected_frameshifts = 0
    nframeshifts = 0
    nsplice_noncanonical = 0
    reference_first_stop_start = -1
    reference_first_stop_end = -1

    # map between the new cds sequence and the reference
    # sequence
    map_cds2reference = alignlib_lite.py_makeAlignmentBlocks()

    ###################################################
    # process first exon
    exon = transcript[0]
    transcript_id = exon.transcript_id

    # collect offset for exon.start
    genome_start = exon.start
    genome_start += _getOffset(genome_start, offsets)
    lcds, cds = 0, []
    cds_starts = [0]

    # still need to deal with deletions of first base:
    exon_starts = [genome_start]
    exon_key = (exon.start, exon.end)
    exon_sequence = exons[exon_key]
    exon_seq = "".join(exon_sequence)

    cds.append(exon_seq)
    _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)
    lcds = len(exon_seq)

    if len(exon_seq) != exon.end - exon.start:
        nframeshifts += 1

    # add first exon to genome position
    genome_pos = genome_start + len(exon_seq)
    last_end = exon.end

    # correct for deletions at start/end of exon
    start_offset, end_offset = _getEndOffsets(exon_sequence)

    # length of original transcript
    loriginal = sum([x.end - x.start for x in transcript])

    if E.global_options.loglevel >= 8:
        _debugExon(exon, exon_key, exon_seq, genome_pos,
                   start_offset, end_offset)

    for exon in transcript[1:]:

        last_exon_sequence = exon_sequence

        # get the next intron/exon parameters
        exon_key = (exon.start, exon.end)
        exon_sequence = exons[exon_key]
        start_offset, end_offset = _getEndOffsets(exon_sequence)
        intron_key = (last_end, exon.start)

        if last_end == exon.start:
            # catch empty introns
            intron_sequence = []
            intron_key = None
        else:
            intron_sequence = introns[intron_key]

        intron_seq = "".join(intron_sequence)

        ###################################################
        # add preceding intron
        new_exon = True

        if len(intron_seq) > frameshiftsize:

            intron_name, intron_seq5, intron_seq3 = Genomics.GetIntronType(
                intron_seq)
            if intron_name == "unknown":
                if intron_seq[:2].islower() and intron_seq[-2:].islower():
                    E.debug(
                        "%s: transcript has unknown splice signal - kept because not a variant: %s: %s:%s" %
                        (transcript_id, intron_name,
                         intron_seq5, intron_seq3))
                    nsplice_noncanonical += 1
                else:
                    is_splice_truncated = True
                    E.debug(
                        "%s: transcript has splice truncated allele: %s: %s:%s" %
                        (transcript_id, intron_name,
                         intron_seq5, intron_seq3))
                    break
            # start a new exon
            cds_starts.append(lcds)

        else:
            # treat as frameshifting intron
            #
            # frame-shifting introns are checked if they are
            # fixed by indels either in the intron itself or
            # the terminal exon sequence. To this end, the effective
            # size of the intron is computed:
            # effective size of intron =
            # indels at terminal x bases at previous exon
            # + size of intron
            # + indels at terminal x bases at next exon
            effective_intron_size = len(intron_seq)

            # Only the last ``frameshiftsize`` positions of the previous
            # exon count. The former slice started at
            # max(0, -frameshiftsize), i.e. always at 0, and thus summed
            # the indels of the whole exon.
            tail_start = max(0, len(last_exon_sequence) - frameshiftsize)
            previous_indels = _sumIndels(last_exon_sequence[tail_start:])
            next_indels = _sumIndels(exon_sequence[:frameshiftsize])
            effective_intron_size += previous_indels + next_indels

            if previous_indels + next_indels == 0 and \
                    len(intron_seq) % 3 == 0:
                has_stop = "X" in Genomics.translate(intron_seq.upper(),
                                                     is_seleno=is_seleno)
            else:
                has_stop = False

            if effective_intron_size % 3 == 0 and not has_stop:
                E.debug(
                    "%s: fixed frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i)" %
                    (transcript_id, last_end, exon.start,
                     effective_intron_size,
                     len(intron_seq),
                     previous_indels, next_indels,))

                # add to previous exon
                cds.append(intron_seq)
                lcds += len(intron_seq)
                ncorrected_frameshifts += 1
                new_exon = False
            else:
                E.debug(
                    "%s: could not fix frame-shifting intron %i-%i of size %i (size:%i, indels:%i,%i, has_stop=%i)" %
                    (transcript_id, last_end, exon.start,
                     effective_intron_size,
                     len(intron_seq),
                     previous_indels, next_indels,
                     has_stop))

                nuncorrected_frameshifts += 1
                # start a new exon
                cds_starts.append(lcds)

        if E.global_options.loglevel >= 8:
            _debugIntron(exon, intron_key, intron_seq, last_end, genome_pos)

        genome_pos += len(intron_seq)

        # assertion - check if genomic coordinate of intron is consistent
        # with offset
        test_offset = _getOffset(exon.start, offsets)
        is_offset = genome_pos - exon.start
        assert is_offset == test_offset, "intron offset difference: %i != %i" % (
            is_offset, test_offset)

        ###################################################
        # add the exon
        exon_seq = "".join(exon_sequence)
        cds.append(exon_seq)

        if len(exon_seq) != exon.end - exon.start:
            nframeshifts += 1

        if new_exon:
            if reference_coordinates:
                exon_starts.append(exon.start + start_offset)
            else:
                exon_starts.append(genome_pos)

        _addCds2Reference(map_cds2reference, lcds, exon_sequence, exon.start)

        lcds += len(exon_seq)
        last_end = exon.end

        if E.global_options.loglevel >= 8:
            _debugExon(exon, exon_key, exon_seq, genome_pos,
                       start_offset, end_offset)

        genome_pos += len(exon_seq)

        test_offset = _getOffset(exon.end, offsets)
        is_offset = genome_pos - exon.end
        assert is_offset == test_offset, "exon offset difference: %i != %i" % (
            is_offset, test_offset)

    cds = "".join(cds)
    assert lcds == len(cds)

    # fix incomplete codons at the end of the sequence
    if lcds % 3 != 0:
        offset = lcds % 3
        cds = cds[:-offset]

    # add frame correction for transcripts that do not start at frame=0
    start_frame = (3 - (int(transcript[0].frame) % 3)) % 3

    # n are ignored (? in sequence to deal with genes like Muc2)
    peptide = Genomics.translate("n" * start_frame + cds,
                                 is_seleno=is_seleno,
                                 prefer_lowercase=False,
                                 ignore_n=True)

    # find the first stop codon
    if start_frame != 0:
        # ignore first, potentially incomplete base
        pep_first_stop = peptide.upper().find("X", 1)
    else:
        pep_first_stop = peptide.upper().find("X")

    E.debug("%s: translated peptide = %s, first stop at %i" %
            (transcript_id, peptide, pep_first_stop))

    peptide = peptide.replace("?", "x")

    if E.global_options.loglevel >= 8:
        E.debug("peptide=%s" % peptide)
        E.debug("cds=%s" % cds)

    E.debug("%s: start_frame=%i, first stop at %i/%i" %
            (transcript_id, start_frame, pep_first_stop, len(peptide)))

    lpeptide, lcds = len(peptide), len(cds)

    # check for non-sense mediated decay
    if pep_first_stop != -1:
        cds_first_stop = pep_first_stop * 3 - start_frame
        if cds_first_stop < cds_starts[-1]:
            # stop codon lies before the start of the last exon
            if ncorrected_frameshifts or nuncorrected_frameshifts:
                E.warn(
                    "nmd knockout transcript %s has frameshifts: %i corrected, %i uncorrected" %
                    (transcript_id,
                     ncorrected_frameshifts,
                     nuncorrected_frameshifts))
            is_nmd_knockout = True
            cds = peptide = ""
            lpeptide, lcds = 0, 0
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        elif pep_first_stop < len(peptide) - 1:
            is_stop_truncated = True
            cds = cds[:cds_first_stop]
            # the slice used to be a bare expression whose result was
            # discarded, leaving the peptide untruncated
            peptide = peptide[:pep_first_stop]
            lpeptide, lcds = len(peptide), len(cds)
            reference_first_stop_start, reference_first_stop_end = \
                (map_cds2reference.mapRowToCol(cds_first_stop),
                 map_cds2reference.mapRowToCol(cds_first_stop + 3))
        else:
            E.warn(
                "first stop at %i(cds=%i) ignored: last exon start at %i" %
                (pep_first_stop,
                 cds_first_stop,
                 cds_starts[-1]))
    else:
        # -1 for no stop codon found
        pep_first_stop = -1
        cds_first_stop = -1
        lpeptide, lcds = len(peptide), len(cds)

    # NOTE(review): peptide is always a string at this point, so this
    # warning can not trigger - confirm the intended condition.
    if peptide is None and nframeshifts == 0:
        E.warn(
            "transcript %s is knockout, though there are no indels - must be nonsense mutation" %
            (transcript_id))

    # build frames
    frames = [start_frame]
    start = start_frame
    l = 0
    for end in cds_starts[1:]:
        l += end - start
        frames.append((3 - l % 3) % 3)
        start = end

    return Allele._make((
        cds,
        peptide,
        len(cds_starts),
        cds_starts,
        exon_starts,
        frames,
        is_nmd_knockout,
        is_splice_truncated,
        is_stop_truncated,
        nframeshifts,
        ncorrected_frameshifts,
        nuncorrected_frameshifts,
        pep_first_stop,
        lpeptide,
        cds_first_stop,
        lcds,
        reference_first_stop_start,
        reference_first_stop_end,
        loriginal,
        nsplice_noncanonical,
    )), map_cds2reference
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignment chains from ``options.stdin`` and writes one psl
    formatted entry per chain to ``options.stdout``. A chain is a
    header line starting with ``chain`` followed by lines of
    ``size dt dq`` and a final line holding only ``size``. Lines that
    are empty or start with ``#`` are ignored.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    ninput, nskipped, noutput = 0, 0, 0

    def chain_iterator(infile):
        '''yield the lines of one chain at a time from infile.'''
        lines = []
        for line in infile:
            if line.startswith("#"):
                continue
            if line.strip() == "":
                continue
            if line.startswith("chain"):
                if lines:
                    yield lines
                lines = []
            lines.append(line)

        # do not yield an empty chain if there was no input at all
        if lines:
            yield lines

    for lines in chain_iterator(options.stdin):

        ninput += 1

        if len(lines) < 2:
            # a header without any block can not be converted
            E.warn("chain without alignment blocks skipped: %s" %
                   lines[0].strip())
            nskipped += 1
            continue

        psl = Blat.Match()

        # split() copes with a missing newline at the end of the input,
        # which slicing off the last character does not
        (_, _,
         psl.mSbjctId, target_length, target_strand,
         target_start, target_end,
         psl.mQueryId, query_length, query_strand,
         query_start, query_end,
         alignment_id) = lines[0].split()

        (psl.mQueryStart, psl.mQueryEnd, psl.mQueryLength,
         psl.mSbjctStart, psl.mSbjctEnd, psl.mSbjctLength) = \
            [int(x) for x in
             (query_start, query_end, query_length,
              target_start, target_end, target_length)]

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line.split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        # the last block has no gaps following it
        size = int(lines[-1].strip())

        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert (target_strand == "+")

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = \
                psl.mQueryLength - psl.mQueryTo, \
                psl.mQueryLength - psl.mQueryFrom

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def process(matches):
    '''merge a list of matches into a single collinear match.

    A graph is built with one node per match plus a source and a
    target node. Two matches are connected if they do not overlap on
    the query and are in order on the target; the edge weight is the
    size of the query gap between them. The source/target nodes are
    connected to every match, weighted by the unaligned query residues
    before/after the match. The matches on the shortest path from
    source to target are combined into one alignment, which is written
    to ``options.stdout``.

    *matches* is sorted in place by ``mQueryFrom``. ``options`` is not
    a parameter but read from the surrounding scope.

    :param matches: non-empty list of matches.
    :returns: 1
    '''
    # copy is taken before sorting, thus from the first match as given
    new = matches[0].copy()

    map_query2target = alignlib_lite.py_makeAlignmentBlocks()

    graph = networkx.DiGraph()
    graph.add_nodes_from(range(len(matches) + 2))

    matches.sort(key=lambda x: x.mQueryFrom)

    if Genomics.IsPositiveStrand(matches[0].strand):
        f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
    else:
        f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

    for x, xx in enumerate(matches):

        if options.loglevel >= 6:
            options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

        for y in range(x + 1, len(matches)):
            yy = matches[y]
            d = min(xx.mQueryTo, yy.mQueryTo) - \
                max(xx.mQueryFrom, yy.mQueryFrom)
            if d > 0 or not f(xx, yy):
                continue

            # Edge attributes are given as keywords. A dictionary as
            # third positional argument is only accepted by old
            # versions of networkx.
            graph.add_edge(x, y, weight=-d)

    source = len(matches)
    target = len(matches) + 1
    for x, xx in enumerate(matches):
        graph.add_edge(source, x, weight=xx.mQueryFrom)
        graph.add_edge(x, target, weight=xx.mQueryLength - xx.mQueryTo)

    if options.loglevel >= 6:
        networkx.write_edgelist(graph, options.stdlog)

    path = networkx.dijkstra_path(graph, source, target)

    if options.loglevel >= 6:
        options.stdlog.write("# path: %s\n" % (str(path)))

    # first and last node of the path are source and target
    new_matches = [matches[x] for x in path[1:-1]]

    if len(matches) != len(new_matches):
        E.warn(("query=%s, target=%s, strand=%s: "
                "removed overlapping/out-of-order segments: "
                "before=%i, after=%i") %
               (matches[0].mQueryId,
                matches[0].mSbjctId,
                matches[0].strand,
                len(matches),
                len(new_matches)))

    matches = new_matches

    for match in matches:
        m = match.getMapQuery2Target()
        alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

    new.fromMap(map_query2target, use_strand=True)

    options.stdout.write(str(new) + "\n")
    options.stdout.flush()
    return 1
def pslMap(options):
    """thread psl alignments using intervals.

    Iterates over ``iterator_psl_intervals(options)``, which yields a
    match together with a list of query intervals (*qx*) and a list of
    target intervals (*tx*). Each interval is unpacked as a
    ``(start, end, value)`` triple. For every query interval the first
    target interval that passes the overlap and length filters is used
    to restrict the match alignment; the restricted match is written to
    ``options.stdout`` if it has more than ``options.min_aligned``
    aligned positions.

    With ``options.format == "gtf"`` the interval values are themselves
    used as alignments to filter with; otherwise the alignment is copied
    restricted to the coordinate ranges.

    Counters are collected in an ``E.Counter`` and logged at the end.
    Nothing is returned.
    """

    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    # minimum size of a query/target interval to be considered
    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        # NOTE(review): because of the elif, tx stays None when both qx
        # and tx are None and the match is then skipped below - confirm
        # that this is intended.
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        elif tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            # project the query interval onto the target through the
            # match alignment
            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            # for matches on the negative strand, mirror the query
            # interval within the query length (done after the
            # projection above, which uses the unmirrored coordinates)
            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                # skip targets that do not overlap the projected query
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                # skip targets that are too small
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)
                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    # first filter by the query interval value, if any
                    # (py_RR/py_CR presumably select which sides of the
                    # two alignments are combined - TODO confirm)
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    # then filter the intermediate result by the target
                    # interval value, if any
                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                            options.stdlog.write(
                                "# map : %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_target)))
                            options.stdlog.write(
                                "# after : %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        new)))
                    else:
                        new = tmp

                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                # output only if strictly more than min_aligned positions
                # remain aligned
                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1

                # only the first target interval passing the filters is
                # used for a query interval
                break
            else:
                # the loop over targets finished without break: no target
                # interval passed the filters
                c.nooverlap += 1

    E.info("map: %s" % str(c))
def process(matches):
    """Chain compatible matches into one alignment and write it out.

    *matches* is a list of match objects for one query/target/strand
    combination; it is sorted in place by query start. Matches become
    nodes of a directed graph. An edge connects two matches that do not
    overlap on the query and are in strand-consistent order on the
    target, weighted by the query gap between them. Two extra nodes act
    as source and target and are weighted with the uncovered query
    prefix and suffix, so that the shortest path picks the chain leaving
    the fewest query positions uncovered. Matches off that path are
    removed and a warning is issued.

    The merged match is written to ``options.stdout``; ``options`` is
    read from module scope.

    Returns 1.
    """
    # Copy the first match as output template before sorting the list.
    new = matches[0].copy()

    map_query2target = alignlib_lite.py_makeAlignmentBlocks()

    graph = networkx.DiGraph()
    # range() instead of the Python 2 only xrange(); the node list is
    # tiny, so materialising it on Python 2 costs nothing.
    graph.add_nodes_from(range(len(matches) + 2))

    matches.sort(key=lambda x: x.mQueryFrom)

    # f(a, b): does b follow a on the target, given the strand?
    if Genomics.IsPositiveStrand(matches[0].strand):
        f = lambda x, y: x.mSbjctTo < y.mSbjctFrom
    else:
        f = lambda x, y: x.mSbjctFrom > y.mSbjctTo

    for x in range(0, len(matches)):
        xx = matches[x]
        if options.loglevel >= 6:
            options.stdlog.write("# graph: %2i %s\n" % (x, str(xx)))

        for y in range(x + 1, len(matches)):
            yy = matches[y]
            # positive d: overlap on the query, otherwise -d is the gap
            d = min(xx.mQueryTo, yy.mQueryTo) - \
                max(xx.mQueryFrom, yy.mQueryFrom)
            if d <= 0 and f(xx, yy):
                # Keyword form: a positional attribute dictionary is
                # rejected by networkx 2.0 and later.
                graph.add_edge(x, y, weight=-d)

    source = len(matches)
    target = len(matches) + 1
    for x in range(len(matches)):
        xx = matches[x]
        graph.add_edge(source, x, weight=xx.mQueryFrom)
        graph.add_edge(x, target, weight=xx.mQueryLength - xx.mQueryTo)

    if options.loglevel >= 6:
        networkx.write_edgelist(graph, options.stdlog)

    path = networkx.dijkstra_path(graph, source, target)

    if options.loglevel >= 6:
        options.stdlog.write("# path: %s\n" % (str(path)))

    # path[0] and path[-1] are the artificial source and target nodes.
    new_matches = [matches[x] for x in path[1:-1]]

    if len(matches) != len(new_matches):
        E.warn(("query=%s, target=%s, strand=%s: "
                "removed overlapping/out-of-order segments: "
                "before=%i, after=%i") %
               (matches[0].mQueryId,
                matches[0].mSbjctId,
                matches[0].strand,
                len(matches),
                len(new_matches)))

    matches = new_matches

    for match in matches:
        m = match.getMapQuery2Target()
        alignlib_lite.py_addAlignment2Alignment(map_query2target, m)

    new.fromMap(map_query2target, use_strand=True)

    options.stdout.write(str(new) + "\n")
    options.stdout.flush()
    return 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Every chain record found on ``options.stdin`` is converted into a
    ``Blat.Match`` and printed as one line on ``options.stdout``.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: chain2psl.py 2899 2010-04-13 14:37:37Z andreas $",
        usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # nskipped only appears in the summary; nothing increments it.
    ninput, nskipped, noutput = 0, 0, 0

    def chain_iterator(infile):
        """Group the lines of *infile* into chain records.

        Lines starting with ``#`` and empty lines are ignored. Each
        yielded list starts with a ``chain`` header line followed by the
        block lines up to the next header.
        """
        record = []
        for line in infile:
            if line.startswith("#"):
                continue
            if not line.strip():
                continue
            if line.startswith("chain") and record:
                yield record
                record = []
            record.append(line)
        # Input without any record must not produce an empty list.
        if record:
            yield record

    for lines in chain_iterator(options.stdin):
        ninput += 1
        psl = Blat.Match()

        # Header line: 13 whitespace separated fields. split() copes
        # with the line terminator, so the line is not truncated by hand
        # (that dropped a character from a final line without newline).
        header = lines[0].split()
        (_, _,
         psl.mSbjctId, target_length, target_strand,
         target_start, target_end,
         psl.mQueryId, query_length, query_strand,
         query_start, query_end,
         alignment_id) = header

        psl.mQueryStart = int(query_start)
        psl.mQueryEnd = int(query_end)
        psl.mQueryLength = int(query_length)
        psl.mSbjctStart = int(target_start)
        psl.mSbjctEnd = int(target_end)
        psl.mSbjctLength = int(target_length)

        map_query2target = alignlib_lite.py_makeAlignmentBlocks()

        qstart, tstart = psl.mQueryStart, psl.mSbjctStart

        # Inner lines are "size dt dq": an ungapped block, then the gap
        # lengths on target (dt) and query (dq) before the next block.
        for line in lines[1:-1]:
            size, dt, dq = [int(x) for x in line.split()]
            map_query2target.addDiagonal(qstart,
                                         qstart + size,
                                         tstart - qstart)
            qstart += size + dq
            tstart += size + dt

        # The closing line of a record is the size of the last block.
        size = int(lines[-1])
        map_query2target.addDiagonal(qstart,
                                     qstart + size,
                                     tstart - qstart)

        psl.fromMap(map_query2target)

        # sort out strand
        # target_strand is always positive
        assert target_strand == "+"

        # if query strand is negative
        if query_strand == "-":
            # invert both query and target
            psl.switchTargetStrand()
            # manually invert the query coordinates
            psl.mQueryFrom, psl.mQueryTo = (
                psl.mQueryLength - psl.mQueryTo,
                psl.mQueryLength - psl.mQueryFrom)

        options.stdout.write("%s\n" % psl)
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads matches from stdin through ``Blat.BlatIterator``. For each
    match the wiggle values over the target range are fetched and those
    at positions that map onto the query are kept. One line of summary
    statistics per match is written to ``options.stdout``; matches
    without any value are counted as skipped.

    Raises IOError if ``--wiggle-files`` matches no file.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: psl2wiggle_stats.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("--wiggle-files", dest="wiggle_files", type="string",
                      help="glob expression for wiggle files [%default].")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add to contig names before lookup [%default].")

    parser.add_option("-z", "--from-zipped", dest="from_zipped",
                      action="store_true",
                      help="input is zipped.")

    parser.add_option("--test", dest="test", type="int",
                      help="test - stop after # rows of parsing [%default].")

    parser.add_option("--with-values", dest="with_values",
                      action="store_true",
                      help="output values in last column [%default].")

    parser.set_defaults(wiggle_files="*.data.bz2",
                        from_zipped=False,
                        prefix="",
                        with_values=False,
                        test=None)

    # Hand argv on, otherwise an explicitly supplied *argv* is ignored.
    (options, args) = E.Start(parser, argv=argv, add_pipe_options=True)

    def blat_list(values):
        """Format *values* as the comma-terminated list used in the
        block columns of the blat format string.

        A value that already is a string is passed through unchanged.
        """
        if isinstance(values, str):
            return values
        return ",".join([str(v) for v in values]) + ","

    # open indexed access to wiggles
    wiggle_files = glob.glob(options.wiggle_files)
    if not wiggle_files:
        raise IOError("could not find wiggle files with '%s'" %
                      options.wiggle_files)

    index = Wiggle.WiggleMultiIndexedAccess(wiggle_files,
                                            keep_open=True,
                                            use_cache=False)

    iterator = Blat.BlatIterator(sys.stdin)

    ninput, noutput, nskipped = 0, 0, 0

    options.stdout.write(
        "query\tnali\t%s" %
        ("\t".join(Stats.DistributionalParameters().getHeaders())))
    if options.with_values:
        options.stdout.write("\tvalues")
    options.stdout.write("\n")

    while True:

        if options.test and ninput >= options.test:
            break

        # The iterator signals the end of input by returning None.
        match = iterator.next()

        if match is None:
            break

        ninput += 1

        if options.loglevel >= 2:
            options.stdlog.write(str(match) + "\n")

        # psl always matches on the forward strand

        map_genome2query = alignlib_lite.py_makeAlignmentBlocks()
        # Block starts and sizes are written as comma-terminated lists;
        # a bare %s on a sequence would emit its repr ("[1, 2]").
        f = alignlib_lite.py_AlignmentFormatBlat(
            "%i\t%i\t%i\t%i\t%s\t%s\t%s\n" % (
                match.mSbjctFrom,
                match.mSbjctTo,
                match.mQueryFrom,
                match.mQueryTo,
                blat_list(match.mSbjctBlockStarts),
                blat_list(match.mQueryBlockStarts),
                blat_list(match.mBlockSizes)))
        f.copy(map_genome2query)

        data = index.get(options.prefix + match.mSbjctId,
                         match.mSbjctFrom,
                         match.mSbjctTo)

        # Each item of data is (start position, consecutive values);
        # keep the values at positions that map onto the query.
        values = []
        for pos, vv in data:
            for v in vv:
                if map_genome2query.mapRowToCol(pos) >= 0:
                    values.append(v)
                pos += 1

        if not values:
            nskipped += 1
            continue

        noutput += 1

        if options.loglevel >= 2:
            options.stdlog.write(
                "# %s\n" % ",".join(["%5.3f" % v for v in values]))

        s = Stats.DistributionalParameters(values)
        options.stdout.write("%s\t%i\t%s" % (match.mQueryId,
                                             match.mNMismatches +
                                             match.mNMatches,
                                             str(s)))

        if options.with_values:
            options.stdout.write(
                "\t%s" % ",".join(["%5.3f" % v for v in values]))

        options.stdout.write("\n")

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))

    E.Stop()
# Small manual check of the alignlib_lite block alignment and its blat
# text format: build a single diagonal, print its extent, then print the
# format object before and after copying it back into the alignment.
import alignlib_lite as alignlib

x = alignlib.py_makeAlignmentBlocks()
x.addDiagonal(10, 100, 0)

# A single pre-formatted string inside print(...) yields the same output
# under the Python 2 print statement and the Python 3 print function.
print("%s %s %s" % (x.getNumAligned(), x.getRowFrom(), x.getRowTo()))

f = alignlib.py_AlignmentFormatBlat(x)
print(str(f))

f.copy(x)
print(str(f))