def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $", usage=globals()["__doc__"]) parser.add_option("-s", "--sequences", dest="filename_sequences", type="string", help="peptide sequence [Default=%default]") parser.add_option("-f", "--format", dest="format", type="string", help="output format [Default=%default]") parser.add_option( "-e", "--expand", dest="expand", action="store_true", help= "expand positions from peptide to nucleotide alignment [Default=%default]" ) parser.add_option("-m", "--map", dest="filename_map", type="string", help="map alignments [Default=%default]") parser.add_option("-c", "--codons", dest="require_codons", action="store_true", help="require codons [Default=%default]") parser.add_option( "--one-based-coordinates", dest="one_based_coordinates", action="store_true", help= "expect one-based coordinates. The default are zero based coordinates [Default=%default]." ) parser.add_option("--no-identical", dest="no_identical", action="store_true", help="do not output identical pairs [Default=%default]") parser.add_option( "-g", "--no-gaps", dest="no_gaps", action="store_true", help="remove all gaps from aligned sequences [Default=%default]") parser.add_option("-x", "--exons", dest="filename_exons", type="string", help="filename with exon boundaries [Default=%default]") parser.add_option("-o", "--outfile", dest="filename_outfile", type="string", help="filename to save links [Default=%default]") parser.add_option("--min-length", dest="min_length", type="int", help="minimum length of alignment [Default=%default]") parser.add_option( "--filter", dest="filename_filter", type="string", help= "given a set of previous alignments, only write new pairs [Default=%default]." ) parser.set_defaults(filename_sequences=None, filename_exons=None, filename_map=None, filename_outfile=None, no_gaps=False, format="fasta", expand=False, require_codons=False, no_identical=False, min_length=0, report_step=100, one_based_coordinates=False, filename_filter=None) (options, args) = E.Start(parser, add_mysql_options=True) t0 = time.time() if options.filename_sequences: sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r")) else: sequences = {} if options.loglevel >= 1: options.stdlog.write("# read %i sequences\n" % len(sequences)) sys.stdout.flush() if options.filename_exons: exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r")) else: exons = {} if options.loglevel >= 1: options.stdlog.write("# read %i exons\n" % len(exons)) sys.stdout.flush() if options.filename_map: map_old2new = {} for line in open(options.filename_map, "r"): if line[0] == "#": continue m = Map() m.read(line) map_old2new[m.mToken] = m else: map_old2new = {} if options.loglevel >= 1: options.stdlog.write("# read %i maps\n" % len(map_old2new)) sys.stdout.flush() if options.filename_filter: if options.loglevel >= 1: options.stdlog.write("# reading filtering information.\n") sys.stdout.flush() map_pair2hids = {} if os.path.exists(options.filename_filter): infile = open(options.filename_filter, "r") iterator = FastaIterator.FastaIterator(infile) while 1: cur_record = iterator.next() if cur_record is None: break record1 = cur_record cur_record = iterator.next() if cur_record is None: break record2 = cur_record identifier1 = re.match("(\S+)", record1.title).groups()[0] identifier2 = re.match("(\S+)", record2.title).groups()[0] id = "%s-%s" % (identifier1, identifier2) s = Genomics.GetHID(record1.sequence + ";" + record2.sequence) if id not in map_pair2hids: map_pair2hids[id] = [] map_pair2hids[id].append(s) infile.close() if options.loglevel >= 1: options.stdlog.write( "# read filtering information for %i pairs.\n" % len(map_pair2hids)) sys.stdout.flush() else: map_pair2hids = None if options.loglevel >= 1: options.stdlog.write("# finished input in %i seconds.\n" % (time.time() - t0)) if options.filename_outfile: outfile = open(options.filename_outfile, "w") else: outfile = None map_row2col = alignlib_lite.py_makeAlignmentVector() tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector() counts = {} iterations = 0 t1 = time.time() ninput, nskipped, noutput = 0, 0, 0 for link in BlastAlignments.iterator_links(sys.stdin): iterations += 1 ninput += 1 if options.loglevel >= 1: if (iterations % options.report_step == 0): options.stdlog.write("# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1)) sys.stdout.flush() if link.mQueryToken not in sequences or \ link.mSbjctToken not in sequences: nskipped += 1 continue if options.loglevel >= 3: options.stdlog.write("# read link %s\n" % str(link)) row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken]) col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken]) if options.one_based_coordinates: link.mQueryFrom -= 1 link.mSbjctFrom -= 1 if options.expand: link.mQueryFrom = link.mQueryFrom * 3 link.mSbjctFrom = link.mSbjctFrom * 3 link.mQueryAli = ScaleAlignment(link.mQueryAli, 3) link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3) map_row2col.clear() alignlib_lite.py_AlignmentFormatEmissions( link.mQueryFrom, link.mQueryAli, link.mSbjctFrom, link.mSbjctAli).copy(map_row2col) if link.mQueryToken in map_old2new: tmp1_map_row2col.clear() map_old2new[link.mQueryToken].expand() if options.loglevel >= 3: options.stdlog.write("# combining in row with %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_old2new[link.mQueryToken].mMapOld2New))) alignlib_lite.py_combineAlignment( tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New, map_row2col, alignlib_lite.py_RR) map_old2new[link.mQueryToken].clear() alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col) if link.mSbjctToken in map_old2new: tmp1_map_row2col.clear() map_old2new[link.mSbjctToken].expand() if options.loglevel >= 3: options.stdlog.write("# combining in col with %s\n" % str( alignlib_lite.py_AlignmentFormatEmissions( map_old2new[link.mSbjctToken].mMapOld2New))) alignlib_lite.py_combineAlignment( tmp1_map_row2col, map_row2col, map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR) map_old2new[link.mSbjctToken].clear() alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col) dr = row_seq.getLength() - map_row2col.getRowTo() dc = col_seq.getLength() - map_row2col.getColTo() if dr < 0 or dc < 0: raise ValueError( "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" % (link.mQueryToken, link.mSbjctToken, row_seq.getLength(), col_seq.getLength(), str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col)))) if options.loglevel >= 2: options.stdlog.write( str( alignlib_lite.py_AlignmentFormatExplicit( map_row2col, row_seq, col_seq)) + "\n") # check for incomplete codons if options.require_codons: naligned = map_row2col.getNumAligned() # turned off, while fixing alignlib_lite if naligned % 3 != 0: options.stdlog.write("# %s\n" % str(map_row2col)) options.stdlog.write("# %s\n" % str(link)) options.stdlog.write("# %s\n" % str(map_old2new[link.mQueryToken])) options.stdlog.write("# %s\n" % str(map_old2new[link.mSbjctToken])) options.stdlog.write("#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit( map_row2col, row_seq, col_seq)) raise ValueError( "incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken)) # if so desired, write on a per exon level: if exons: if link.mQueryToken not in exons: raise IndexError("%s not found in exons" % (link.mQueryToken)) if link.mSbjctToken not in exons: raise IndexError("%s not found in exons" % (link.mSbjctToken)) exons1 = exons[link.mQueryToken] exons2 = exons[link.mSbjctToken] # Get overlapping segments segments = Exons.MatchExons(map_row2col, exons1, exons2) for a, b in segments: tmp1_map_row2col.clear() # make sure you got codon boundaries. Note that frameshifts # in previous exons will cause the codons to start at positions # different from mod 3. The problem is that I don't know where # the frameshifts occur exactly. The exon boundaries are given # with respect to the cds, which include the frame shifts. # Unfortunately, phase information seems to be incomplete in # the input files. from1, to1 = GetAdjustedBoundaries(a, exons1) from2, to2 = GetAdjustedBoundaries(b, exons2) alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col, from1 + 1, to1, from2 + 1, to2) mode = Write(tmp1_map_row2col, row_seq, col_seq, link, no_gaps=options.no_gaps, no_identical=options.no_identical, min_length=options.min_length, suffix1="_%s" % str(a), suffix2="_%s" % str(b), outfile=outfile, pair_filter=map_pair2hid, format=options.format) if mode not in counts: counts[mode] = 0 counts[mode] += 1 else: mode = Write(map_row2col, row_seq, col_seq, link, min_length=options.min_length, no_gaps=options.no_gaps, no_identical=options.no_identical, outfile=outfile, pair_filter=map_pair2hids, format=options.format) if mode not in counts: counts[mode] = 0 counts[mode] += 1 noutput += 1 if outfile: outfile.close() if options.loglevel >= 1: options.stdlog.write("# %s\n" % ", ".join( map(lambda x, y: "%s=%i" % (x, y), counts.keys(), counts.values()))) options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped)) E.Stop()
def main(argv=None): parser = E.OptionParser( version= "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option( "--random-proportion", dest="random_proportion", type="float", help="mask randomly columns in multiple alignments [default=%default]") parser.add_option( "--random", dest="random", action="store_true", help="shuffle quality scores before masking [default=%default]") parser.set_defaults( quality_threshold=40, quality_file="quality", filename_map=None, frame=3, ) (options, args) = E.Start(parser) ################################################## ################################################## ################################################## # read map ################################################## infile = open(options.filename_map) map_genes2genome = {} for match in Blat.iterator(infile): assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId map_genes2genome[match.mQueryId] = match infile.close() ################################################## ################################################## ################################################## # get quality scores ################################################## quality = IndexedFasta.IndexedFasta(options.quality_file) quality.setTranslator(IndexedFasta.TranslatorBytes()) ################################################## ################################################## ################################################## # main loop ################################################## ninput, noutput, nmissed = 0, 0, 0 options.stdout.write("cluster_id\tstart\tend\n") for line in options.stdin: if line.startswith("cluster_id"): continue ninput += 1 cluster_id, gene_id, alignment = line[:-1].split("\t") if gene_id not in map_genes2genome: nmissed += 1 E.warn("gene_id %s not found in map." % gene_id) continue match = map_genes2genome[gene_id] map_gene2genome = match.getMapQuery2Target() is_negative = match.strand == "-" # if strand is negative, the coordinates are # on the negative strand of the gene/query # in order to work in the right coordinate system # revert the sequence if is_negative: alignment = alignment[::-1] # get map of gene to alignment map_gene2mali = alignlib_lite.py_makeAlignmentVector() fillAlignment(map_gene2mali, alignment) # get quality scores quality_scores = quality.getSequence(match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo) # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2genome)) # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2mali)) # print quality_scores map_mali2genome = alignlib_lite.py_makeAlignmentVector() alignlib_lite.py_combineAlignment(map_mali2genome, map_gene2mali, map_gene2genome, alignlib_lite.py_RR) # print str(alignlib_lite.py_AlignmentFormatEmissions( # map_mali2genome)) # shuffle quality scores, but only those that are aligned if options.random: positions = [] for fp, c in enumerate(alignment): if c == "-": continue y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom if y < 0: continue positions.append(y) scores = [quality_scores[x] for x in positions] random.shuffle(scores) for p, q in zip(positions, scores): quality_scores[p] = q # negative strand to_mask = [] # reverse position rp = len(alignment) for fp, c in enumerate(alignment): rp -= 1 if c == "-": continue y = map_mali2genome.mapRowToCol(fp) - match.mSbjctFrom if y < 0: continue if quality_scores[y] < options.quality_threshold: if is_negative: p = rp else: p = fp E.debug( "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" % (cluster_id, p, c, match.mSbjctId, match.strand, map_mali2genome.mapRowToCol(fp), quality_scores[y])) if options.frame > 1: start = (p // options.frame) * options.frame to_mask.extend(list(range(start, start + options.frame))) else: to_mask.append(p) regions = Iterators.group_by_distance(sorted(to_mask)) for start, end in regions: options.stdout.write("%s\t%i\t%i\n" % (cluster_id, start, end)) noutput += 1 E.info("ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed)) E.Stop()
def main(): parser = E.OptionParser( version = "%prog version: $Id: malis2masks.py 2781 2009-09-10 11:33:14Z andreas $", usage = globals()["__doc__"]) parser.add_option("--random-proportion", dest="random_proportion", type="float", help="mask randomly columns in multiple alignments [default=%default]" ) parser.add_option("--random", dest="random", action="store_true", help="shuffle quality scores before masking [default=%default]" ) parser.set_defaults( quality_threshold = 40, quality_file = "quality", filename_map = None, frame = 3, ) (options, args) = E.Start( parser ) ################################################## ################################################## ################################################## ## read map ################################################## infile = open(options.filename_map) map_genes2genome = {} for match in Blat.iterator( infile ): assert match.mQueryId not in map_genes2genome, "duplicate entry %s" % match.mQueryId map_genes2genome[match.mQueryId] = match infile.close() ################################################## ################################################## ################################################## ## get quality scores ################################################## quality = IndexedFasta.IndexedFasta( options.quality_file ) quality.setTranslator( IndexedFasta.TranslatorBytes() ) ################################################## ################################################## ################################################## ## main loop ################################################## ninput, noutput, nmissed = 0, 0, 0 options.stdout.write( "cluster_id\tstart\tend\n" ) for line in options.stdin: if line.startswith("cluster_id"): continue ninput += 1 cluster_id, gene_id, alignment = line[:-1].split("\t") if gene_id not in map_genes2genome: nmissed += 1 E.warn( "gene_id %s not found in map." % gene_id ) continue match = map_genes2genome[gene_id] map_gene2genome = match.getMapQuery2Target() is_negative = match.strand == "-" # if strand is negative, the coordinates are # on the negative strand of the gene/query # in order to work in the right coordinate system # revert the sequence if is_negative: alignment = alignment[::-1] # get map of gene to alignment map_gene2mali = alignlib_lite.py_makeAlignmentVector() fillAlignment( map_gene2mali, alignment ) # get quality scores quality_scores = quality.getSequence( match.mSbjctId, "+", match.mSbjctFrom, match.mSbjctTo) # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2genome)) # print str(alignlib_lite.py_AlignmentFormatEmissions( map_gene2mali)) # print quality_scores map_mali2genome = alignlib_lite.py_makeAlignmentVector() alignlib_lite.py_combineAlignment( map_mali2genome, map_gene2mali, map_gene2genome, alignlib_lite.py_RR ) # print str(alignlib_lite.py_AlignmentFormatEmissions( map_mali2genome)) # shuffle quality scores, but only those that are aligned if options.random: positions = [] for fp,c in enumerate(alignment): if c == "-": continue y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom if y < 0: continue positions.append( y ) scores = [ quality_scores[ x ] for x in positions ] random.shuffle(scores) for p,q in zip( positions,scores): quality_scores[p] = q # negative strand to_mask = [] ## reverse position rp = len(alignment) for fp,c in enumerate(alignment): rp -= 1 if c == "-": continue y = map_mali2genome.mapRowToCol( fp ) - match.mSbjctFrom if y < 0: continue if quality_scores[y] < options.quality_threshold: if is_negative: p = rp else: p = fp E.debug( "low quality base: id=%s, mali=%i, char=%s, contig=%s, strand=%s, pos=%i, quality=%i" % \ (cluster_id, p, c, match.mSbjctId, match.strand, map_mali2genome.mapRowToCol( fp ), quality_scores[y] ) ) if options.frame > 1: start = (p // options.frame) * options.frame to_mask.extend( list( range(start, start + options.frame) ) ) else: to_mask.append( p ) regions = Iterators.group_by_distance( sorted(to_mask) ) for start,end in regions: options.stdout.write( "%s\t%i\t%i\n" % (cluster_id, start, end ) ) noutput += 1 E.info( "ninput=%i, noutput=%i, nmissed=%i" % (ninput, noutput, nmissed) ) E.Stop()
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $", usage = globals()["__doc__"] ) parser.add_option( "-s", "--sequences", dest="filename_sequences", type="string", help="peptide sequence [Default=%default]" ) parser.add_option( "-f", "--format", dest="format", type="string", help="output format [Default=%default]" ) parser.add_option( "-e", "--expand", dest="expand", action="store_true", help="expand positions from peptide to nucleotide alignment [Default=%default]") parser.add_option( "-m", "--map", dest="filename_map", type="string", help="map alignments [Default=%default]") parser.add_option( "-c", "--codons", dest="require_codons", action="store_true", help="require codons [Default=%default]") parser.add_option( "--one-based-coordinates", dest="one_based_coordinates", action="store_true", help="expect one-based coordinates. The default are zero based coordinates [Default=%default].") parser.add_option( "--no-identical", dest="no_identical", action="store_true", help="do not output identical pairs [Default=%default]" ) parser.add_option( "-g", "--no-gaps", dest="no_gaps", action="store_true", help="remove all gaps from aligned sequences [Default=%default]") parser.add_option( "-x", "--exons", dest="filename_exons", type="string", help="filename with exon boundaries [Default=%default]") parser.add_option( "-o", "--outfile", dest="filename_outfile", type="string", help="filename to save links [Default=%default]") parser.add_option( "--min-length", dest="min_length", type="int", help="minimum length of alignment [Default=%default]") parser.add_option( "--filter", dest="filename_filter", type="string", help="given a set of previous alignments, only write new pairs [Default=%default].") parser.set_defaults( filename_sequences = None, filename_exons = None, filename_map = None, filename_outfile = None, no_gaps = False, format = "fasta", expand = False, require_codons = False, no_identical = False, min_length = 0, report_step = 100, one_based_coordinates = False, filename_filter = None) (options, args) = E.Start( parser, add_mysql_options = True ) t0 = time.time() if options.filename_sequences: sequences = Genomics.ReadPeptideSequences( open(options.filename_sequences, "r") ) else: sequences = {} if options.loglevel >= 1: options.stdlog.write( "# read %i sequences\n" % len(sequences) ) sys.stdout.flush() if options.filename_exons: exons = Exons.ReadExonBoundaries( open(options.filename_exons, "r") ) else: exons = {} if options.loglevel >= 1: options.stdlog.write( "# read %i exons\n" % len(exons) ) sys.stdout.flush() if options.filename_map: map_old2new = {} for line in open(options.filename_map, "r"): if line[0] == "#": continue m = Map() m.read( line ) map_old2new[m.mToken] = m else: map_old2new = {} if options.loglevel >= 1: options.stdlog.write( "# read %i maps\n" % len(map_old2new) ) sys.stdout.flush() if options.filename_filter: if options.loglevel >= 1: options.stdlog.write( "# reading filtering information.\n" ) sys.stdout.flush() map_pair2hids = {} if os.path.exists( options.filename_filter ): infile = open(options.filename_filter, "r") iterator = FastaIterator.FastaIterator( infile ) while 1: cur_record = iterator.next() if cur_record is None: break record1 = cur_record cur_record = iterator.next() if cur_record is None: break record2 = cur_record identifier1 = re.match("(\S+)", record1.title).groups()[0] identifier2 = re.match("(\S+)", record2.title).groups()[0] id = "%s-%s" % (identifier1, identifier2) s = Genomics.GetHID(record1.sequence + ";" + record2.sequence) if id not in map_pair2hids: map_pair2hids[id] = [] map_pair2hids[id].append( s ) infile.close() if options.loglevel >= 1: options.stdlog.write( "# read filtering information for %i pairs.\n" % len(map_pair2hids) ) sys.stdout.flush() else: map_pair2hids = None if options.loglevel >= 1: options.stdlog.write( "# finished input in %i seconds.\n" % (time.time() - t0)) if options.filename_outfile: outfile = open(options.filename_outfile, "w") else: outfile = None map_row2col = alignlib_lite.py_makeAlignmentVector() tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector() counts = {} iterations = 0 t1 = time.time() ninput, nskipped, noutput = 0, 0, 0 for link in BlastAlignments.iterator_links( sys.stdin ): iterations += 1 ninput += 1 if options.loglevel >= 1: if (iterations % options.report_step == 0): options.stdlog.write( "# iterations: %i in %i seconds.\n" % (iterations, time.time() - t1) ) sys.stdout.flush() if link.mQueryToken not in sequences or \ link.mSbjctToken not in sequences: nskipped += 1 continue if options.loglevel >= 3: options.stdlog.write( "# read link %s\n" % str(link) ) row_seq = alignlib_lite.py_makeSequence( sequences[link.mQueryToken] ) col_seq = alignlib_lite.py_makeSequence( sequences[link.mSbjctToken] ) if options.one_based_coordinates: link.mQueryFrom -= 1 link.mSbjctFrom -= 1 if options.expand: link.mQueryFrom = link.mQueryFrom * 3 link.mSbjctFrom = link.mSbjctFrom * 3 link.mQueryAli = ScaleAlignment( link.mQueryAli, 3 ) link.mSbjctAli = ScaleAlignment( link.mSbjctAli, 3 ) map_row2col.clear() alignlib_lite.py_AlignmentFormatEmissions( link.mQueryFrom, link.mQueryAli, link.mSbjctFrom, link.mSbjctAli ).copy( map_row2col ) if link.mQueryToken in map_old2new: tmp1_map_row2col.clear() map_old2new[link.mQueryToken].expand() if options.loglevel >= 3: options.stdlog.write( "# combining in row with %s\n" %\ str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mQueryToken].mMapOld2New ) )) alignlib_lite.py_combineAlignment( tmp1_map_row2col, map_old2new[link.mQueryToken].mMapOld2New, map_row2col, alignlib_lite.py_RR ) map_old2new[link.mQueryToken].clear() alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col ) if link.mSbjctToken in map_old2new: tmp1_map_row2col.clear() map_old2new[link.mSbjctToken].expand() if options.loglevel >= 3: options.stdlog.write( "# combining in col with %s\n" %\ str(alignlib_lite.py_AlignmentFormatEmissions(map_old2new[link.mSbjctToken].mMapOld2New ) )) alignlib_lite.py_combineAlignment( tmp1_map_row2col, map_row2col, map_old2new[link.mSbjctToken].mMapOld2New, alignlib_lite.py_CR ) map_old2new[link.mSbjctToken].clear() alignlib_lite.py_copyAlignment( map_row2col, tmp1_map_row2col ) dr = row_seq.getLength() - map_row2col.getRowTo() dc = col_seq.getLength() - map_row2col.getColTo() if dr < 0 or dc < 0: raise ValueError("out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %\ (link.mQueryToken, link.mSbjctToken, row_seq.getLength(), col_seq.getLength(), str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col)))) if options.loglevel >= 2: options.stdlog.write( str( alignlib_lite.py_AlignmentFormatExplicit( map_row2col, row_seq, col_seq )) + "\n" ) ## check for incomplete codons if options.require_codons: naligned = map_row2col.getNumAligned() # turned off, while fixing alignlib_lite if naligned % 3 != 0: options.stdlog.write( "# %s\n" % str(map_row2col) ) options.stdlog.write( "# %s\n" % str(link) ) options.stdlog.write( "# %s\n" % str(map_old2new[link.mQueryToken]) ) options.stdlog.write( "# %s\n" % str(map_old2new[link.mSbjctToken]) ) options.stdlog.write( "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit( map_row2col, row_seq, col_seq ) ) raise ValueError("incomplete codons %i in pair %s - %s" % (naligned, link.mQueryToken, link.mSbjctToken)) ## if so desired, write on a per exon level: if exons: if link.mQueryToken not in exons: raise IndexError("%s not found in exons" % (link.mQueryToken)) if link.mSbjctToken not in exons: raise IndexError("%s not found in exons" % (link.mSbjctToken)) exons1 = exons[link.mQueryToken] exons2 = exons[link.mSbjctToken] ## Get overlapping segments segments = Exons.MatchExons( map_row2col, exons1, exons2 ) for a,b in segments: tmp1_map_row2col.clear() # make sure you got codon boundaries. Note that frameshifts # in previous exons will cause the codons to start at positions # different from mod 3. The problem is that I don't know where # the frameshifts occur exactly. The exon boundaries are given # with respect to the cds, which include the frame shifts. # Unfortunately, phase information seems to be incomplete in the input files. from1, to1 = GetAdjustedBoundaries( a, exons1 ) from2, to2 = GetAdjustedBoundaries( b, exons2 ) alignlib_lite.py_copyAlignment( tmp1_map_row2col, map_row2col, from1+1, to1, from2+1, to2 ) mode = Write( tmp1_map_row2col, row_seq, col_seq, link, no_gaps = options.no_gaps, no_identical = options.no_identical, min_length = options.min_length, suffix1="_%s" % str(a), suffix2="_%s" % str(b), outfile = outfile, pair_filter = map_pair2hid, format = options.format ) if mode not in counts: counts[mode] = 0 counts[mode] += 1 else: mode = Write( map_row2col, row_seq, col_seq, link, min_length = options.min_length, no_gaps = options.no_gaps, no_identical = options.no_identical, outfile = outfile, pair_filter = map_pair2hids, format = options.format ) if mode not in counts: counts[mode] = 0 counts[mode] += 1 noutput += 1 if outfile: outfile.close() if options.loglevel >= 1: options.stdlog.write("# %s\n" % ", ".join( map( lambda x,y: "%s=%i" % (x,y), counts.keys(), counts.values() ) )) options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" % (ninput, noutput, nskipped) ) E.Stop()