def expand(self):
    """Build the old-to-new alignment on first use.

    Returns immediately when ``self.mMapOld2New`` is already truthy.
    Otherwise a new alignment vector is stored in ``self.mMapOld2New`` and
    filled from the emissions-format fields ``mOldFrom``/``mOldAli`` and
    ``mNewFrom``/``mNewAli``.
    """
    if self.mMapOld2New:
        return

    self.mMapOld2New = alignlib_lite.py_makeAlignmentVector()
    emissions = alignlib_lite.py_AlignmentFormatEmissions(
        self.mOldFrom, self.mOldAli,
        self.mNewFrom, self.mNewAli)
    emissions.copy(self.mMapOld2New)
def expand(self):
    """Materialise ``self.mMapOld2New`` unless it is already populated.

    When the attribute is falsy, a new alignment vector is created, stored
    on the instance and filled from the emissions-format description held
    in ``mOldFrom``, ``mOldAli``, ``mNewFrom`` and ``mNewAli``.
    """
    already_expanded = bool(self.mMapOld2New)
    if not already_expanded:
        self.mMapOld2New = alignlib_lite.py_makeAlignmentVector()
        formatted = alignlib_lite.py_AlignmentFormatEmissions(
            self.mOldFrom, self.mOldAli, self.mNewFrom, self.mNewAli)
        formatted.copy(self.mMapOld2New)
def GetMap(self):
    """return map between the two segments.

    Returns None when either ``mAlignmentFrom1`` or ``mAlignmentFrom2`` is
    falsy; otherwise returns a new alignment vector filled from the
    emissions-format fields of both segments.
    """
    if not (self.mAlignmentFrom1 and self.mAlignmentFrom2):
        return None

    segment_map = alignlib_lite.py_makeAlignmentVector()
    emissions = alignlib_lite.py_AlignmentFormatEmissions(
        self.mAlignmentFrom1, self.mAlignment1,
        self.mAlignmentFrom2, self.mAlignment2)
    emissions.copy(segment_map)
    return segment_map
def fillFromTable(self, table_row):
    """Populate this prediction's attributes from one table row.

    The first 25 fields are assigned positionally to the prediction
    attributes (``mPredictionId`` ... ``mAlignmentString``).  If the row has
    26 or more fields, field 26 is stored in ``mNAssembled``; any further
    fields are ignored.

    When ``self.mExpand`` is set, the peptide-to-translation alignment is
    rebuilt from the emissions-format fields (if both alignment strings are
    non-empty) and the peptide-to-genome alignment is parsed from
    ``mAlignmentString``.

    Raises:
        ValueError: if the row has fewer than 25 fields.
    """
    nfields = len(table_row)
    if nfields < 25:
        # The original formatted an undefined name here and therefore
        # failed with NameError instead of the intended ValueError.
        raise ValueError("unknown format: %i fields" % nfields)

    (self.mPredictionId,
     self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
     self.mRank, self.score,
     self.mQueryFrom, self.mQueryTo, self.mQueryAli,
     self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
     self.mQueryLength, self.mQueryCoverage,
     self.mNGaps, self.mNFrameShifts, self.mNIntrons,
     self.mNSplits, self.mNStopCodons,
     self.mPercentIdentity, self.mPercentSimilarity,
     self.mTranslation,
     self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
     self.mAlignmentString) = table_row[:25]

    if nfields >= 26:
        self.mNAssembled = table_row[25]

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def fillFromTable(self, table_row):
    """Populate this prediction's attributes from one table row.

    The first 25 fields are assigned positionally to the prediction
    attributes (``mPredictionId`` ... ``mAlignmentString``).  If the row has
    26 or more fields, field 26 is stored in ``mNAssembled``; any further
    fields are ignored.

    When ``self.mExpand`` is set, the peptide-to-translation alignment is
    rebuilt from the emissions-format fields (if both alignment strings are
    non-empty) and the peptide-to-genome alignment is parsed from
    ``mAlignmentString``.

    Raises:
        ValueError: if the row has fewer than 25 fields.
    """
    nfields = len(table_row)
    if nfields < 25:
        # The original formatted an undefined name here and therefore
        # failed with NameError instead of the intended ValueError.
        raise ValueError("unknown format: %i fields" % nfields)

    (self.mPredictionId,
     self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
     self.mRank, self.score,
     self.mQueryFrom, self.mQueryTo, self.mQueryAli,
     self.mSbjctFrom, self.mSbjctTo, self.mSbjctAli,
     self.mQueryLength, self.mQueryCoverage,
     self.mNGaps, self.mNFrameShifts, self.mNIntrons,
     self.mNSplits, self.mNStopCodons,
     self.mPercentIdentity, self.mPercentSimilarity,
     self.mTranslation,
     self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
     self.mAlignmentString) = table_row[:25]

    if nfields >= 26:
        self.mNAssembled = table_row[25]

    if self.mExpand:
        self.mMapPeptide2Translation = alignlib_lite.py_makeAlignmentVector()

        if self.mQueryAli != "" and self.mSbjctAli != "":
            alignlib_lite.py_AlignmentFormatEmissions(
                self.mQueryFrom, self.mQueryAli,
                self.mSbjctFrom, self.mSbjctAli).copy(
                    self.mMapPeptide2Translation)

        self.mMapPeptide2Genome = Genomics.String2Alignment(
            self.mAlignmentString)
def Expand(self):
    """Switch the prediction to expanded mode.

    Sets ``mExpand``.  If the peptide-to-translation alignment is
    non-empty, the emissions-format strings and the row/column start and
    end coordinates are refreshed from it.  The peptide-to-genome
    alignment is always re-parsed from ``mAlignmentString``.
    """
    self.mExpand = True

    peptide2translation = self.mMapPeptide2Translation
    if peptide2translation.getLength() > 0:
        emissions = alignlib_lite.py_AlignmentFormatEmissions(
            peptide2translation)
        self.mQueryAli, self.mSbjctAli = (emissions.mRowAlignment,
                                          emissions.mColAlignment)

        self.mQueryFrom = peptide2translation.getRowFrom()
        self.mQueryTo = peptide2translation.getRowTo()
        self.mSbjctFrom = peptide2translation.getColFrom()
        self.mSbjctTo = peptide2translation.getColTo()

    self.mMapPeptide2Genome = Genomics.String2Alignment(
        self.mAlignmentString)
def Expand(self):
    """Mark the prediction as expanded and refresh derived fields.

    ``mExpand`` is set to True.  When ``mMapPeptide2Translation`` has a
    positive length, ``mQueryAli``/``mSbjctAli`` and the query/sbjct
    from/to coordinates are re-derived from it.  ``mMapPeptide2Genome`` is
    rebuilt from ``mAlignmentString`` in every case.
    """
    self.mExpand = True

    ali = self.mMapPeptide2Translation
    has_alignment = ali.getLength() > 0
    if has_alignment:
        fmt = alignlib_lite.py_AlignmentFormatEmissions(ali)
        self.mQueryAli, self.mSbjctAli = fmt.mRowAlignment, fmt.mColAlignment

        self.mQueryFrom = ali.getRowFrom()
        self.mQueryTo = ali.getRowTo()
        self.mSbjctFrom = ali.getColFrom()
        self.mSbjctTo = ali.getColTo()

    genome_map = Genomics.String2Alignment(self.mAlignmentString)
    self.mMapPeptide2Genome = genome_map
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignment links from stdin and hands each aligned pair to
    ``Write``.  Links whose query or sbjct token has no sequence are
    skipped.  Alignments can be scaled from peptide to nucleotide
    coordinates (``--expand``), remapped through per-token maps (``--map``),
    split per matching exon pair (``--exons``) and filtered against pairs
    read from a previous output (``--filter``).

    Raises:
        ValueError: if an alignment extends beyond either sequence, or if
            ``--codons`` is set and the number of aligned positions is not
            a multiple of three.
        IndexError: if ``--exons`` is given and a token has no exon entry.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")
    parser.add_option(
        "-e", "--expand", dest="expand", action="store_true",
        help="expand positions from peptide to nucleotide alignment [Default=%default]")
    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")
    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")
    parser.add_option(
        "--one-based-coordinates", dest="one_based_coordinates",
        action="store_true",
        help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")
    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")
    parser.add_option(
        "-g", "--no-gaps", dest="no_gaps", action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")
    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")
    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")
    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")
    parser.add_option(
        "--filter", dest="filename_filter", type="string",
        help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read input tables: sequences, exon boundaries, old->new maps
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    # ------------------------------------------------------------------
    # read previously written pairs; records come in consecutive pairs
    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):
            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator(infile)

            while 1:
                record1 = iterator.next()
                if record1 is None:
                    break

                record2 = iterator.next()
                if record2 is None:
                    break

                identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                pair_id = "%s-%s" % (identifier1, identifier2)
                hid = Genomics.GetHID(
                    record1.sequence + ";" + record2.sequence)

                if pair_id not in map_pair2hids:
                    map_pair2hids[pair_id] = []
                map_pair2hids[pair_id].append(hid)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # alignment objects are allocated once and cleared for every link
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
                link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        # remap the row (query) through its old->new map, if there is one
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # remap the column (sbjct) through its old->new map, if there is one
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                (link.mQueryToken,
                 link.mSbjctToken,
                 row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # A token need not have a map entry; use .get() so that the
                # diagnostics cannot raise KeyError and hide the ValueError.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at
                # positions different from mod 3. The problem is that I
                # don't know where the frameshifts occur exactly. The exon
                # boundaries are given with respect to the cds, which
                # include the frame shifts. Unfortunately, phase
                # information seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                # pair_filter previously referenced the undefined name
                # ``map_pair2hid`` and raised NameError in exon mode.
                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % (key, value) for key, value in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def pslMap(options):
    """thread psl alignments using intervals.

    For every ``(match, qx, tx)`` triple produced by
    ``iterator_psl_intervals(options)``, each query interval in ``qx`` is
    paired with the first target interval in ``tx`` that overlaps its mapped
    range and is long enough.  The alignment restricted to that pair is
    written to ``options.stdout`` when more than ``options.min_aligned``
    positions are aligned.  Counters are collected in an ``E.Counter`` and
    logged at the end.

    Options read here: ``format``, ``min_aligned``, ``loglevel``,
    ``stdout``, ``stdlog``.
    """

    # "gtf" input restricts with alignment filters; everything else uses a
    # coordinate range copy.
    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        # NOTE(review): because of the ``elif``, tx is not defaulted when
        # qx is None as well; such a match is then skipped below - confirm
        # whether that is intended.
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        elif tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            # query interval projected through the alignment; compared
            # against the target intervals below
            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            # mirror the query interval on the query length for "-" strand
            # matches (done after the projection above)
            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)
                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart,
                        qend,
                        tstart,
                        tend)
                else:
                    # do copy with alignment filter
                    # the third tuple element of the query/target interval
                    # is used as the filter alignment when it is truthy
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                # NOTE(review): the nesting of the two blocks below under
                # the loglevel check was reconstructed from a
                # whitespace-collapsed source - confirm against the file.
                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                # strictly more than min_aligned aligned positions needed
                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                # only the first acceptable target per query is processed
                break
            else:
                # loop ended without break: no target was acceptable
                c.nooverlap += 1

    E.info("map: %s" % str(c))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads two multiple alignments (the two positional arguments), builds a
    profile from each, aligns the profiles with a full dynamic programming
    alignator and writes the merged multiple alignment to stdout.

    Raises:
        ValueError: unless exactly two positional arguments are supplied.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")
    parser.add_option(
        "-e", "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")
    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    first_mali, second_mali = Mali.Mali(), Mali.Mali()

    E.info("read 2 multiple alignments")

    for mali, filename in ((first_mali, args[0]), (second_mali, args[1])):
        mali.readFromFile(IOTools.openFile(filename, "r"),
                          format=options.format)

    first_cmali = Mali.convertMali2Alignlib(first_mali)
    second_cmali = Mali.convertMali2Alignlib(second_mali)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults have to be in place before profiles are built
    encoder = alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20)
    alignlib_lite.py_setDefaultEncoder(encoder)
    logoddor = alignlib_lite.py_makeLogOddorDirichlet(0.3)
    alignlib_lite.py_setDefaultLogOddor(logoddor)
    regularizor = alignlib_lite.py_makeRegularizorDirichletPrecomputed()
    alignlib_lite.py_setDefaultRegularizor(regularizor)

    first_profile = alignlib_lite.py_makeProfile(first_cmali)
    second_profile = alignlib_lite.py_makeProfile(second_cmali)

    profile_alignment = alignlib_lite.py_makeAlignmentVector()
    alignator.align(profile_alignment, first_profile, second_profile)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(profile_alignment))

    first_cmali.add(second_cmali, profile_alignment)

    all_identifiers = (first_mali.getIdentifiers() +
                       second_mali.getIdentifiers())
    outmali = Mali.convertAlignlib2Mali(first_cmali,
                                        identifiers=all_identifiers)

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
# Align every tab-separated pair read from stdin with a local
# dynamic-programming alignator and print one result row per pair:
# tokens, score, alignment (emissions format), percent identity,
# percent similarity and number of gaps.
alignator = alignlib_lite.py_makeAlignatorDPFull(
    alignlib_lite.py_ALIGNMENT_LOCAL, param_gop, param_gep)
map_query2token = alignlib_lite.py_makeAlignmentVector()

for line in sys.stdin:
    if line[0] == "#":
        continue

    # Strip only the line terminator: slicing off the last character
    # truncated the sequence when the final line had no newline.
    query_token, sbjct_token, query_sequence, sbjct_sequence = \
        line.rstrip("\n").split("\t")

    map_query2token.clear()
    row = alignlib_lite.py_makeSequence(query_sequence)
    col = alignlib_lite.py_makeSequence(sbjct_sequence)
    alignator.align(map_query2token, row, col)

    pidentity = 100.0 * \
        alignlib_lite.py_calculatePercentIdentity(
            map_query2token, row, col)
    psimilarity = 100.0 * \
        alignlib_lite.py_calculatePercentSimilarity(map_query2token)

    # str methods replace string.split/string.join and the print
    # statement, which only exist in Python 2.
    sys.stdout.write("\t".join(map(str, (
        query_token, sbjct_token,
        map_query2token.getScore(),
        alignlib_lite.py_AlignmentFormatEmissions(map_query2token),
        pidentity,
        psimilarity,
        map_query2token.getNumGaps()))) + "\n")

if __name__ == "__main__":
    sys.exit(main(sys.argv))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    For every peptide sequence read (from ``--peptides`` or stdin), the
    matching cds sequence is looked up, optionally through the identifier
    map given by ``--map``, and the peptide is mapped onto it with
    ``getMapPeptide2Cds``.  The result is written either as an alignment
    row or as a fasta record.  Peptides without a cds sequence and peptides
    for which the mapping raises ValueError are counted and skipped.

    Raises:
        ValueError: if no cds file is given, or if fasta output yields an
            aligned string whose length is not three times the peptide
            length.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: peptides2cds.py 2890 2010-04-07 08:58:54Z andreas $")

    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds", "--cdnas", dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="filename with map of peptide identifiers to cdna identifiers [%default].")

    parser.add_option("--output-identifier", dest="output_identifier",
                      type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    parser.add_option("-f", "--output-format=", dest="output_format",
                      type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # The option only accepts "cdna" or "peptide"; the previous comparison
    # against "cds" could never be true, so the cds id was never used.
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(
            peptide_identifier, peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            # Report the length of the cds actually aligned; looking it up
            # by the output identifier raised KeyError whenever a peptide
            # id had been mapped to a different cds id.
            options.stdout.write("\t".join(map(
                str,
                (identifier,
                 alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                 len(cur_record.sequence),
                 len(c)))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads alignment links from stdin and hands each aligned pair to
    ``Write``.  Links whose query or sbjct token has no sequence are
    skipped.  Alignments can be scaled from peptide to nucleotide
    coordinates (``--expand``), remapped through per-token maps (``--map``),
    split per matching exon pair (``--exons``) and filtered against pairs
    read from a previous output (``--filter``).

    Raises:
        ValueError: if an alignment extends beyond either sequence, or if
            ``--codons`` is set and the number of aligned positions is not
            a multiple of three.
        IndexError: if ``--exons`` is given and a token has no exon entry.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="output format [Default=%default]")
    parser.add_option(
        "-e", "--expand", dest="expand", action="store_true",
        help="expand positions from peptide to nucleotide alignment [Default=%default]")
    parser.add_option("-m", "--map", dest="filename_map", type="string",
                      help="map alignments [Default=%default]")
    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")
    parser.add_option(
        "--one-based-coordinates", dest="one_based_coordinates",
        action="store_true",
        help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")
    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")
    parser.add_option(
        "-g", "--no-gaps", dest="no_gaps", action="store_true",
        help="remove all gaps from aligned sequences [Default=%default]")
    parser.add_option("-x", "--exons", dest="filename_exons", type="string",
                      help="filename with exon boundaries [Default=%default]")
    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")
    parser.add_option("--min-length", dest="min_length", type="int",
                      help="minimum length of alignment [Default=%default]")
    parser.add_option(
        "--filter", dest="filename_filter", type="string",
        help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()

    # ------------------------------------------------------------------
    # read input tables: sequences, exon boundaries, old->new maps
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    map_old2new = {}
    if options.filename_map:
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    # ------------------------------------------------------------------
    # read previously written pairs; records come in consecutive pairs
    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):
            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator(infile)

            while 1:
                record1 = iterator.next()
                if record1 is None:
                    break

                record2 = iterator.next()
                if record2 is None:
                    break

                identifier1 = re.match(r"(\S+)", record1.title).groups()[0]
                identifier2 = re.match(r"(\S+)", record2.title).groups()[0]

                pair_id = "%s-%s" % (identifier1, identifier2)
                hid = Genomics.GetHID(
                    record1.sequence + ";" + record2.sequence)

                if pair_id not in map_pair2hids:
                    map_pair2hids[pair_id] = []
                map_pair2hids[pair_id].append(hid)

            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    # alignment objects are allocated once and cleared for every link
    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}

    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
                link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        # remap the row (query) through its old->new map, if there is one
        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        # remap the column (sbjct) through its old->new map, if there is one
        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()
        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s" %
                (link.mQueryToken,
                 link.mSbjctToken,
                 row_seq.getLength(),
                 col_seq.getLength(),
                 str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                # A token need not have a map entry; use .get() so that the
                # diagnostics cannot raise KeyError and hide the ValueError.
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mQueryToken)))
                options.stdlog.write(
                    "# %s\n" % str(map_old2new.get(link.mSbjctToken)))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))

                raise ValueError(
                    "incomplete codons %i in pair %s - %s" %
                    (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at
                # positions different from mod 3. The problem is that I
                # don't know where the frameshifts occur exactly. The exon
                # boundaries are given with respect to the cds, which
                # include the frame shifts. Unfortunately, phase
                # information seems to be incomplete in the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                # pair_filter previously referenced the undefined name
                # ``map_pair2hid`` and raised NameError in exon mode.
                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                counts[mode] = counts.get(mode, 0) + 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            counts[mode] = counts.get(mode, 0) + 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            "%s=%i" % (key, value) for key, value in counts.items()))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """Compare two FASTA files and classify the differences per identifier.

    Both files are read into ``{title: sequence}`` dictionaries, the
    identifiers are passed through ``MapIdentifiers`` with the patterns
    given by ``--pattern1``/``--pattern2`` and sequences are compared
    case-insensitively.  Differing sequences are labelled ``first``,
    ``prefix``, ``last``, ``selenocysteine``, ``masked`` or ``other``.
    With ``--correct-gap-shift`` the function additionally tries to walk
    both sequences in parallel, skipping runs of ``N`` present in only
    one of them, and prints a ``fix`` line when both ends are reached.

    :param argv: command line; defaults to ``sys.argv``.  Note that
        ``E.Start`` is called without it, as in the original code.
    :raises ValueError: if not exactly two files are given or one of
        them contains no sequences.
    :raises ImportError: if ``--correct-gap-shift`` is requested but
        ``alignlib_lite`` cannot be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. "
                      "Requires alignlib_lite.py [%default]")

    parser.add_option("-1", "--pattern1", dest="pattern1", type="string",
                      help="pattern to extract identifier from in identifiers1. "
                      "[%default]")

    parser.add_option("-2", "--pattern2", dest="pattern2", type="string",
                      help="pattern to extract identifier from in identifiers2. "
                      "[%default]")

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" is not a valid string escape
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            # the option is called --correct-gap-shift (see above)
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    seqs1 = dict(
        (x.title, x.sequence) for x in
        FastaIterator.iterate(IOTools.openFile(args[0], "r")))
    seqs2 = dict(
        (x.title, x.sequence) for x in
        FastaIterator.iterate(IOTools.openFile(args[1], "r")))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that were also seen in set 2
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    # "seqdiff" implies the per-sequence "diff" header line
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        else:
            if len(s1) == len(s2):
                # get all differences: the first and last residues
                # can be different for peptide sequences when
                # comparing my translations with ensembl peptides.
                differences = [(c1, c2)
                               for c1, c2 in zip(s1[1:-1], s2[1:-1])
                               if c1 != c2]

                # check for Selenocysteins: every difference involves a U
                if all(d[0] == "U" or d[1] == "U" for d in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues: every difference involves N/X
                elif all(d[0] in "NX" or d[1] in "NX" for d in differences):
                    ndiff_masked += 1
                    status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` is
                # always true here; the loop ends via the end-of-sequence
                # test, a mismatch or an IndexError.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip N-runs that exist in only one sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                            k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    # loop finished without break: both ends reached
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

        if write_diff:
            options.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
def main(argv=None):
    """script main.

    Align two multiple alignments (read in the format given by the
    ``format`` default, "fasta") profile against profile and write the
    merged multiple alignment to ``options.stdout``.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--gop", dest="gop", type="float",
        help="gap opening penalty [default=%default].")
    parser.add_option(
        "-e", "--gep", dest="gep", type="float",
        help="gap extension penalty [default=%default].")
    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(gop=-12.0, gep=-2.0, format="fasta", mode="local")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    # one Mali container per input file, filled in argument order
    malis = [Mali.Mali(), Mali.Mali()]

    E.info("read 2 multiple alignments")

    for container, filename in zip(malis, args):
        container.readFromFile(IOTools.openFile(filename, "r"),
                               format=options.format)

    first_mali, second_mali = malis
    first_cmali = Mali.convertMali2Alignlib(first_mali)
    second_cmali = Mali.convertMali2Alignlib(second_mali)

    # translate the command line mode into the alignlib constant
    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # library-wide defaults, installed before the profiles are built
    encoder = alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20)
    alignlib_lite.py_setDefaultEncoder(encoder)
    logoddor = alignlib_lite.py_makeLogOddorDirichlet(0.3)
    alignlib_lite.py_setDefaultLogOddor(logoddor)
    regularizor = alignlib_lite.py_makeRegularizorDirichletPrecomputed()
    alignlib_lite.py_setDefaultRegularizor(regularizor)

    first_profile = alignlib_lite.py_makeProfile(first_cmali)
    second_profile = alignlib_lite.py_makeProfile(second_cmali)

    profile_map = alignlib_lite.py_makeAlignmentVector()
    alignator.align(profile_map, first_profile, second_profile)

    E.debug("result=\n%s" %
            alignlib_lite.py_AlignmentFormatEmissions(profile_map))

    # merge the second alignment into the first along the profile map
    first_cmali.add(second_cmali, profile_map)

    all_identifiers = first_mali.getIdentifiers() + second_mali.getIdentifiers()
    merged = Mali.convertAlignlib2Mali(first_cmali,
                                       identifiers=all_identifiers)

    merged.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def __str__(self):
    """get a string representation of results.

    Returns one tab-separated line of the prediction's fields.  The
    prediction id is written as the first column only if
    ``self.mPredictionId`` is truthy.

    If ``self.mExpand`` is set, the query/sbjct alignment strings are
    taken from ``self.mMapPeptide2Translation`` (empty strings if that
    map has length 0) and, as a side effect, ``mQueryFrom``, ``mQueryTo``,
    ``mSbjctFrom`` and ``mSbjctTo`` are refreshed from the map.
    Otherwise the stored ``mQueryAli``/``mSbjctAli`` strings are used.
    """
    if self.mExpand:
        if self.mMapPeptide2Translation.getLength() > 0:
            f = alignlib_lite.py_AlignmentFormatEmissions(
                self.mMapPeptide2Translation)
            row_ali, col_ali = f.mRowAlignment, f.mColAlignment
            # keep the stored coordinates in sync with the map
            self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
            self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
            self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
            self.mSbjctTo = self.mMapPeptide2Translation.getColTo()
        else:
            row_ali, col_ali = "", ""
    else:
        row_ali = self.mQueryAli
        col_ali = self.mSbjctAli

    fields = (
        self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
        self.mRank, self.score,
        self.mQueryFrom, self.mQueryTo, row_ali,
        self.mSbjctFrom, self.mSbjctTo, col_ali,
        self.mQueryLength, self.mQueryCoverage,
        self.mNGaps, self.mNFrameShifts, self.mNIntrons,
        self.mNSplits, self.mNStopCodons,
        "%5.2f" % self.mPercentIdentity,
        "%5.2f" % self.mPercentSimilarity,
        self.mTranslation,
        self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
        self.mAlignmentString,
        self.mNAssembled,
    )

    if self.mPredictionId:
        fields = (self.mPredictionId,) + fields

    # str.join instead of string.join, which no longer exists in Python 3
    return "\t".join(map(str, fields))
def main(argv=None):
    """script main.

    Map peptide sequences onto their cds sequences.  Peptides are read
    from ``--peptides-fasta-file`` (or stdin), cds sequences from
    ``--cds-gtf-file``; an optional ``--map`` file translates peptide
    identifiers to cds identifiers.  For each peptide with a cds
    sequence the mapping computed by ``Peptides2Cds.getMapPeptide2Cds``
    is written either as an alignment line or as the aligned cds
    sequence in FASTA format.

    parses command line options in sys.argv, unless *argv* is given.
    Note that ``E.Start`` is called without *argv*, as in the original.

    :raises ValueError: if no cds file is given, or if in ``fasta``
        output the mapped string does not have three times the peptide
        length.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$")

    parser.add_option("-p", "--peptides-fasta-file", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences [%default].")

    parser.add_option("-c", "--cds-gtf-file", "--cdnas", dest="filename_cdna",
                      type="string",
                      help="filename with cdna sequences [%default].")

    parser.add_option(
        "-m", "--map", dest="filename_map", type="string",
        help="filename with map of peptide identifiers to cdna identifiers [%default]."
    )

    parser.add_option("--output-identifier", dest="output_identifier",
                      type="choice",
                      choices=("cdna", "peptide"),
                      help="output identifier to use [%default].")

    # NOTE(review): the long option is registered with a trailing "=";
    # left untouched to keep the command line interface unchanged.
    parser.add_option("-f", "--output-format=", dest="output_format",
                      type="choice",
                      choices=("alignment", "fasta"),
                      help="output format.")

    parser.set_defaults(
        peptides=None,
        filename_cdna=None,
        output_format="alignment",
        filename_map=None,
        stop_codons=("TAG", "TAA", "TGA"),
        output_identifier="peptide",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_cdna:
        raise ValueError("please supply filename with cds sequences.")

    if options.filename_peptides:
        infile = open(options.filename_peptides, "r")
        E.info("reading from %s" % options.filename_peptides)
    else:
        E.info("reading from stdin")
        infile = sys.stdin

    if options.filename_map:
        E.info("reading map")
        map_peptide2cds = IOTools.readMap(
            IOTools.openFile(options.filename_map, "r"))
        E.info("read map for %i identifiers" % len(map_peptide2cds))
    else:
        map_peptide2cds = {}

    E.info("reading cds sequences")

    cds_sequences = Genomics.ReadPeptideSequences(
        IOTools.openFile(options.filename_cdna, "r"))

    E.info("read %i cds sequences" % len(cds_sequences))

    ninput, noutput = 0, 0
    nskipped, nnosequence = 0, 0

    # iterate over peptide sequences
    iterator = FastaIterator.FastaIterator(infile)

    # the option's choices are "cdna"/"peptide"; comparing against "cds"
    # could never be true and silently ignored --output-identifier=cdna
    use_cds_id = options.output_identifier == "cdna"

    for cur_record in iterator:

        ninput += 1

        # the identifier is the first whitespace-delimited word of the title
        peptide_identifier = re.split(r"\s+", cur_record.title)[0]
        cds_identifier = map_peptide2cds.get(peptide_identifier,
                                             peptide_identifier)

        if cds_identifier not in cds_sequences:
            nnosequence += 1
            continue

        p = cur_record.sequence
        c = cds_sequences[cds_identifier]

        E.debug("processing %s: laa=%i (without gaps=%i), lna=%i" %
                (peptide_identifier, len(p), len(re.sub("-", "", p)), len(c)))

        try:
            map_p2c = Peptides2Cds.getMapPeptide2Cds(p, c, options)
        except ValueError:
            nskipped += 1
            continue

        if use_cds_id:
            identifier = cds_identifier
        else:
            identifier = peptide_identifier

        if options.output_format == "alignment":
            # len(c) rather than cds_sequences[identifier]: the output
            # identifier may be the peptide id, which need not be a key
            # of cds_sequences when a map is in use.
            options.stdout.write("\t".join(map(
                str,
                (identifier,
                 alignlib_lite.py_AlignmentFormatEmissions(map_p2c),
                 len(p),
                 len(c)))) + "\n")

        elif options.output_format == "fasta":

            map_p2c.switchRowCol()

            alignatum = alignlib_lite.py_makeAlignatum(c)

            alignatum.mapOnAlignment(map_p2c, len(p) * 3)

            s = alignatum.getString()
            if len(s) != len(p) * 3:
                raise ValueError(
                    "incomplete aligned string for %s: %s, cds=%s" %
                    (cur_record.title, s, c))

            options.stdout.write(">%s\n%s\n" % (identifier, s))

        noutput += 1
        sys.stdout.flush()

    # release the peptide file if this function opened it
    if infile is not sys.stdin:
        infile.close()

    E.info("ninput=%i, noutput=%i, nnosequence=%i, nskipped=%i" %
           (ninput, noutput, nnosequence, nskipped))

    E.Stop()
def __str__(self):
    """get a string representation of results.

    The result is a single tab-separated line.  When
    ``self.mPredictionId`` is truthy it forms the first column;
    otherwise the line starts with the query token.

    With ``self.mExpand`` set, the row/column alignment strings come
    from ``self.mMapPeptide2Translation`` (or are empty if the map has
    length 0), and ``mQueryFrom``/``mQueryTo``/``mSbjctFrom``/``mSbjctTo``
    are overwritten with the map's boundaries as a side effect.  Without
    it, ``mQueryAli`` and ``mSbjctAli`` are written unchanged.
    """
    if not self.mExpand:
        row_ali = self.mQueryAli
        col_ali = self.mSbjctAli
    elif self.mMapPeptide2Translation.getLength() > 0:
        f = alignlib_lite.py_AlignmentFormatEmissions(
            self.mMapPeptide2Translation)
        row_ali, col_ali = f.mRowAlignment, f.mColAlignment
        # refresh the stored coordinates from the alignment map
        self.mQueryFrom = self.mMapPeptide2Translation.getRowFrom()
        self.mQueryTo = self.mMapPeptide2Translation.getRowTo()
        self.mSbjctFrom = self.mMapPeptide2Translation.getColFrom()
        self.mSbjctTo = self.mMapPeptide2Translation.getColTo()
    else:
        row_ali, col_ali = "", ""

    columns = [
        self.mQueryToken, self.mSbjctToken, self.mSbjctStrand,
        self.mRank, self.score,
        self.mQueryFrom, self.mQueryTo, row_ali,
        self.mSbjctFrom, self.mSbjctTo, col_ali,
        self.mQueryLength, self.mQueryCoverage,
        self.mNGaps, self.mNFrameShifts, self.mNIntrons,
        self.mNSplits, self.mNStopCodons,
        "%5.2f" % self.mPercentIdentity,
        "%5.2f" % self.mPercentSimilarity,
        self.mTranslation,
        self.mSbjctGenomeFrom, self.mSbjctGenomeTo,
        self.mAlignmentString,
        self.mNAssembled,
    ]

    if self.mPredictionId:
        columns.insert(0, self.mPredictionId)

    # string.join() is gone in Python 3; str.join works on 2 and 3
    return "\t".join(str(column) for column in columns)
def Align(self, method, anchor=0, loglevel=1):
    """align a pair of sequences.

    get rid of this and use a method class instead in the future

    ``self.mSequence1`` and ``self.mSequence2`` are aligned with the
    requested method and the result is stored on the instance
    (``mAlignment``, ``mAlignedSequence1/2``, ``mAlignment1/2``,
    ``mAlignmentFrom1/To1/From2/To2``, ``mNumGaps``, ``mLength``,
    ``mAligned``, ``mMethod``); ``SetPercentIdentity`` and
    ``SetBlockSizes`` are called afterwards.

    :param method: one of "dialign", "blastz", "dialignlgs", "dba",
        "nw", "sw", or a callable invoked as ``method(s1, s2, map_a2b)``.
        "clustal" is recognised but not implemented.
    :param anchor: number of "A" characters added to both ends of both
        sequences before aligning; the anchor regions are removed from
        the alignment again afterwards.
    :param loglevel: unused in this method.
    :raises NotImplementedError: for ``method == "clustal"``.
    :raises AlignmentError: if the resulting alignment is empty.
    """
    map_a2b = alignlib_lite.py_makeAlignmentVector()

    # pad both sequences with ``anchor`` residues on either side
    s1 = "A" * anchor + self.mSequence1 + "A" * anchor
    s2 = "A" * anchor + self.mSequence2 + "A" * anchor

    self.strand = "+"

    if method == "dialign":
        dialign = WrapperDialign.Dialign(self.mOptionsDialign)
        dialign.Align(s1, s2, map_a2b)
    elif method == "blastz":
        blastz = WrapperBlastZ.BlastZ(self.mOptionsBlastZ)
        blastz.Align(s1, s2, map_a2b)
        if blastz.isReverseComplement():
            self.strand = "-"
            self.mSequence2 = Genomics.complement(self.mSequence2)
    elif method == "dialignlgs":
        dialignlgs = WrapperDialign.Dialign(self.mOptionsDialignLGS)
        dialignlgs.Align(s1, s2, map_a2b)
    elif method == "dba":
        dba = WrapperDBA.DBA()
        dba.Align(s1, s2, map_a2b)
    elif method == "clustal":
        # the WrapperClustal calls that used to follow this raise were
        # unreachable and have been dropped
        raise NotImplementedError("clustal wrapper needs to be updated")
    elif method == "nw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_GLOBAL,
            gop=-12.0, gep=-2.0)
        alignator.align(map_a2b, seq1, seq2)
    elif method == "sw":
        seq1 = alignlib_lite.py_makeSequence(s1)
        seq2 = alignlib_lite.py_makeSequence(s2)
        # NOTE(review): alignator_sw and min_score_sw are not defined in
        # this method; presumably module-level names - confirm.
        alignlib_lite.py_performIterativeAlignment(
            map_a2b, seq1, seq2, alignator_sw, min_score_sw)
    else:
        # use callback function
        method(s1, s2, map_a2b)

    if map_a2b.getLength() == 0:
        raise AlignmentError("empty alignment")

    if anchor:
        # strip the anchor regions and shift back to sequence coordinates
        map_a2b.removeRowRegion(
            anchor + len(self.mSequence1) + 1, map_a2b.getRowTo())
        map_a2b.removeRowRegion(1, anchor)
        map_a2b.removeColRegion(
            anchor + len(self.mSequence2) + 1, map_a2b.getColTo())
        map_a2b.removeColRegion(1, anchor)
        map_a2b.moveAlignment(-anchor, -anchor)

    f = alignlib_lite.py_AlignmentFormatExplicit(
        map_a2b,
        alignlib_lite.py_makeSequence(self.mSequence1),
        alignlib_lite.py_makeSequence(self.mSequence2))

    self.mMethod = method
    self.mAlignment = map_a2b
    self.mAlignedSequence1, self.mAlignedSequence2 = \
        f.mRowAlignment, f.mColAlignment

    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
    self.mAlignment1, self.mAlignment2 = f.mRowAlignment, f.mColAlignment
    self.mAlignmentFrom1 = map_a2b.getRowFrom()
    self.mAlignmentTo1 = map_a2b.getRowTo()
    self.mAlignmentFrom2 = map_a2b.getColFrom()
    self.mAlignmentTo2 = map_a2b.getColTo()
    self.mNumGaps, self.mLength = map_a2b.getNumGaps(), map_a2b.getLength()
    self.mAligned = self.mLength - self.mNumGaps

    self.SetPercentIdentity()
    self.SetBlockSizes()
continue query_token, sbjct_token, query_sequence, sbjct_sequence = string.split(line[:-1], "\t") map_query2token.clear() row = alignlib_lite.py_makeSequence(query_sequence) col = alignlib_lite.py_makeSequence(sbjct_sequence) alignator.align(map_query2token, row, col) pidentity = 100.0 * alignlib_lite.py_calculatePercentIdentity(map_query2token, row, col) psimilarity = 100.0 * alignlib_lite.py_calculatePercentSimilarity(map_query2token) print string.join( map( str, ( query_token, sbjct_token, map_query2token.getScore(), alignlib_lite.py_AlignmentFormatEmissions(map_query2token), pidentity, psimilarity, map_query2token.getNumGaps(), ), ), "\t", ) if __name__ == "__main__": sys.exit(main(sys.argv))
def main(argv=None):
    """Compare the sequences of two FASTA files identifier by identifier.

    Sequences are read with ``Genomics.ReadPeptideSequences``, the
    identifiers are passed through ``MapIdentifiers`` using
    ``--pattern1``/``--pattern2`` and sequences are compared after
    upper-casing.  Each differing pair is labelled ``first``,
    ``prefix``, ``last``, ``selenocysteine``, ``masked`` or ``other``;
    counts are reported through ``E.info``.  With
    ``--correct-gap-shift`` an attempt is made to reconcile runs of
    ``N`` of different length, printing a ``fix`` line on success.

    :param argv: command line; defaults to ``sys.argv``.  ``E.Start``
        is called without it, as in the original code.
    :raises ValueError: if not exactly two files are given or one of
        them yields no sequences.
    :raises ImportError: if ``--correct-gap-shift`` is requested but
        ``alignlib_lite`` cannot be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: diff_fasta.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--correct-gap-shift", dest="correct_shift",
                      action="store_true",
                      help="correct gap length shifts in alignments. Requires alignlib_lite.py_ "
                      "[%default]")

    parser.add_option("-1", "--pattern1", dest="pattern1", type="string",
                      help="pattern to extract identifier from in identifiers1. "
                      "[%default]")

    parser.add_option("-2", "--pattern2", dest="pattern2", type="string",
                      help="pattern to extract identifier from in identifiers2. "
                      "[%default]")

    parser.add_option("-o", "--output", dest="output", type="choice",
                      action="append",
                      choices=("diff", "missed", "seqdiff"),
                      help="what to output [%default]")

    # raw strings: "\S" is not a valid string escape
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (options, args) = E.Start(parser)

    if len(args) != 2:
        raise ValueError("two files needed to compare.")

    if options.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            # the option is called --correct-gap-shift (see above)
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ but alignlib not found")

    seqs1 = Genomics.ReadPeptideSequences(IOTools.openFile(args[0], "r"))
    seqs2 = Genomics.ReadPeptideSequences(IOTools.openFile(args[1], "r"))

    if not seqs1:
        raise ValueError("first file %s is empty." % (args[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (args[1]))

    MapIdentifiers(seqs1, options.pattern1)
    MapIdentifiers(seqs2, options.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that also occur in set 2
    found2 = set()

    write_missed1 = "missed" in options.output
    write_missed2 = "missed" in options.output
    write_seqdiff = "seqdiff" in options.output
    # "seqdiff" implies the per-sequence "diff" header line
    write_diff = "diff" in options.output or write_seqdiff

    for k in seqs1:
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        else:
            if len(s1) == len(s2):
                # get all differences:
                # the first and last residues can be different for
                # peptide sequences when comparing my translations with
                # ensembl peptides.
                differences = [(c1, c2)
                               for c1, c2 in zip(s1[1:-1], s2[1:-1])
                               if c1 != c2]

                # check for Selenocysteins: every difference involves a U
                if all(d[0] == "U" or d[1] == "U" for d in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues: every difference involves N/X
                elif all(d[0] in "NX" or d[1] in "NX" for d in differences):
                    ndiff_masked += 1
                    status = "masked"

            # correct for different gap lengths
            if options.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` is
                # always true here; the loop ends via the end-of-sequence
                # test, a mismatch or an IndexError.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip N-runs that exist in only one sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" % (
                            k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    # loop finished without break: both ends reached
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

        if write_diff:
            options.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            options.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in seqs2:
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                options.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    options.stdlog.write("""# Legend:
# seqs1:          number of sequences in set 1
# seqs2:          number of sequences in set 2
# same:           number of identical sequences
# diff:           number of sequences with differences
# nmissed1:       sequences in set 1 that are not found in set 2
# nmissed2:       sequences in set 2 that are not found in set 1
# Type of sequence differences
# first:          only the first residue is different
# last:           only the last residue is different
# prefix:         one sequence is prefix of the other
# selenocysteine: difference due to selenocysteines
# masked:         difference due to masked residues
# fixed:          fixed differences
# other:          other differences
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info("ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
           (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
            ndiff_selenocysteine, ndiff_masked, nfixed,
            ndiff - ndiff_first - ndiff_last - ndiff_prefix -
            ndiff_selenocysteine - ndiff_masked - nfixed))

    E.Stop()
def main(argv=None):
    """Compare the sequences of two FASTA files identifier by identifier.

    The two positional arguments (returned by ``E.start`` as unknown
    arguments) are read into ``{title: sequence}`` dictionaries, the
    identifiers are passed through ``MapIdentifiers`` using
    ``--pattern1``/``--pattern2`` and sequences are compared after
    upper-casing, in sorted identifier order.  Each differing pair is
    labelled ``first``, ``prefix``, ``last``, ``selenocysteine``,
    ``masked`` or ``other``; counts are reported through ``E.info``.
    With ``--correct-gap-shift`` an attempt is made to reconcile runs
    of ``N`` of different length, printing a ``fix`` line on success.

    :param argv: command line; defaults to ``sys.argv``.  ``E.start``
        is called without it, as in the original code.
    :raises ValueError: if not exactly two files are given or one of
        them contains no sequences.
    :raises ImportError: if ``--correct-gap-shift`` is requested but
        ``alignlib_lite`` cannot be imported.
    """
    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-s", "--correct-gap-shift", dest="correct_shift",
        action="store_true",
        help="correct gap length shifts in alignments. "
        "Requires alignlib_lite.py ")

    parser.add_argument(
        "-1", "--pattern1", dest="pattern1", type=str,
        help="pattern to extract identifier from in identifiers1. ")

    parser.add_argument(
        "-2", "--pattern2", dest="pattern2", type=str,
        help="pattern to extract identifier from in identifiers2. ")

    parser.add_argument(
        "-o", "--output-section", dest="output", type=str,
        action="append",
        choices=("diff", "missed", "seqdiff"),
        help="what to output ")

    # raw strings: "\S" in a plain literal is an invalid escape sequence
    parser.set_defaults(correct_shift=False,
                        pattern1=r"(\S+)",
                        pattern2=r"(\S+)",
                        output=[])

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) != 2:
        raise ValueError("two files needed to compare.")

    if args.correct_shift:
        try:
            import alignlib_lite
        except ImportError:
            # the option is called --correct-gap-shift (see above)
            raise ImportError(
                "option --correct-gap-shift requires alignlib_lite.py_ "
                "but alignlib not found")

    # read each file completely, then close it again
    with iotools.open_file(unknown[0], "r") as inf:
        seqs1 = {x.title: x.sequence for x in FastaIterator.iterate(inf)}
    with iotools.open_file(unknown[1], "r") as inf:
        seqs2 = {x.title: x.sequence for x in FastaIterator.iterate(inf)}

    if not seqs1:
        raise ValueError("first file %s is empty." % (unknown[0]))
    if not seqs2:
        raise ValueError("second file %s is empty." % (unknown[1]))

    MapIdentifiers(seqs1, args.pattern1)
    MapIdentifiers(seqs2, args.pattern2)

    nsame = 0
    nmissed1 = 0
    nmissed2 = 0
    ndiff = 0
    ndiff_first = 0
    ndiff_last = 0
    ndiff_prefix = 0
    ndiff_selenocysteine = 0
    ndiff_masked = 0
    nfixed = 0
    # identifiers of set 1 that also occur in set 2
    found2 = set()

    write_missed1 = "missed" in args.output
    write_missed2 = "missed" in args.output
    write_seqdiff = "seqdiff" in args.output
    # "seqdiff" implies the per-sequence "diff" header line
    write_diff = "diff" in args.output or write_seqdiff

    for k in sorted(seqs1):
        if k not in seqs2:
            nmissed1 += 1
            if write_missed1:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed1"))
            continue

        found2.add(k)

        s1 = seqs1[k].upper()
        s2 = seqs2[k].upper()
        m = min(len(s1), len(s2))

        if s1 == s2:
            nsame += 1
            continue

        status = "other"
        ndiff += 1

        if s1[1:] == s2[1:]:
            ndiff_first += 1
            status = "first"
        elif s1[:m] == s2[:m]:
            ndiff_prefix += 1
            status = "prefix"
        elif s1[:-1] == s2[:-1]:
            ndiff_last += 1
            status = "last"
        else:
            if len(s1) == len(s2):
                # get all differences: the first and last residues
                # can be different for peptide sequences when
                # comparing my translations with ensembl peptides.
                differences = [(c1, c2)
                               for c1, c2 in zip(s1[1:-1], s2[1:-1])
                               if c1 != c2]

                # check for Selenocysteins: every difference involves a U
                if all(d[0] == "U" or d[1] == "U" for d in differences):
                    ndiff_selenocysteine += 1
                    status = "selenocysteine"

                # check for masked residues: every difference involves N/X
                elif all(d[0] in "NX" or d[1] in "NX" for d in differences):
                    ndiff_masked += 1
                    status = "masked"

            # correct for different gap lengths
            if args.correct_shift:

                map_a2b = alignlib_lite.py_makeAlignmentVector()

                a, b = 0, 0
                keep = False

                # NOTE(review): x is never advanced, so ``x < m`` is
                # always true here; the loop ends via the end-of-sequence
                # test, a mismatch or an IndexError.
                x = 0
                while x < m and not (a == len(s1) and b == len(s2)):
                    try:
                        if s1[a] != s2[b]:
                            # skip N-runs that exist in only one sequence
                            while s1[a] == "N" and s2[b] != "N":
                                a += 1
                            while s1[a] != "N" and s2[b] == "N":
                                b += 1

                            if s1[a] != s2[b]:
                                break
                    except IndexError:
                        print("# index error for %s: x=%i, a=%i, b=%i, l1=%i, l2=%i" %
                              (k, x, a, b, len(s1), len(s2)))
                        break

                    a += 1
                    b += 1
                    map_a2b.addPairExplicit(a, b, 0.0)
                    # check if we have reached the end:
                else:
                    # loop finished without break: both ends reached
                    keep = True
                    nfixed += 1
                    f = alignlib_lite.py_AlignmentFormatEmissions(map_a2b)
                    print("fix\t%s\t%s" % (k, str(f)))

                if not keep:
                    print("# warning: not fixable: %s" % k)

        if write_diff:
            args.stdout.write("---- %s ---- %s\n" % (k, status))

        if write_seqdiff:
            args.stdout.write("< %s\n> %s\n" % (seqs1[k], seqs2[k]))

    for k in sorted(seqs2):
        if k not in found2:
            nmissed2 += 1
            if write_missed2:
                args.stdout.write("---- %s ---- %s\n" % (k, "missed2"))

    args.stdlog.write("""# Legend:
""")

    E.info("seqs1=%i, seqs2=%i, same=%i, ndiff=%i, nmissed1=%i, nmissed2=%i" %
           (len(seqs1), len(seqs2), nsame, ndiff, nmissed1, nmissed2))

    E.info(
        "ndiff=%i: first=%i, last=%i, prefix=%i, selenocysteine=%i, masked=%i, fixed=%i, other=%i" %
        (ndiff, ndiff_first, ndiff_last, ndiff_prefix,
         ndiff_selenocysteine, ndiff_masked, nfixed,
         ndiff - ndiff_first - ndiff_last - ndiff_prefix -
         ndiff_selenocysteine - ndiff_masked - nfixed))

    E.stop()