if link.mQueryToken == link.mSbjctToken: continue if link.mQueryToken in cds and \ link.mSbjctToken in cds: # expand to codons if param_expand: link.mQueryFrom = (link.mQueryFrom - 1) * 3 + 1 link.mSbjctFrom = (link.mSbjctFrom - 1) * 3 + 1 link.mQueryAli = ScaleAlignment(link.mQueryAli, 3) link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3) map_row2col.clear() alignlib_lite.AlignmentFormatExplicit( link.mQueryFrom, link.mQueryAli, link.mSbjctFrom, link.mSbjctAli).copy(map_row2col) # test all combinations, the alignment might be a suboptimal alignment in case # of repeats. for e1 in cds[link.mQueryToken]: for e2 in cds[link.mSbjctToken]: tmp_map_row2col.clear() if param_expand: alignlib_lite.copyAlignment( tmp_map_row2col, map_row2col, e1.mPeptideFrom + 1, e1.mPeptideTo, e2.mPeptideFrom + 1, e2.mPeptideTo,
def main(argv=None):
    """script main.

    Compares gene predictions (read from stdin, one prediction per line)
    against known reference exon boundaries and writes a per-prediction
    summary of exon-level agreement: identical / inserted / deleted exons
    and introns, boundary deviations and terminal truncations.  Optionally
    writes a per-exon table (--exons) as well.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    parser.add_option("-b", "--boundaries", dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")

    parser.add_option("-e", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")

    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")

    parser.add_option(
        "-w", "--write-notfound", dest="write_notfound", action="store_true",
        help="print exons for predictions not found in reference.")

    parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # this script reads all input from stdin; positional arguments are an error
    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    # reference exon boundaries, keyed by query token
    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(open(options.filename_boundaries, "r"),
                                                             do_invert=1,
                                                             remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    # optional per-exon output table
    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join((
            "prediction_id",
            "exon_id", "exon_from", "exon_to", "exon_frame",
            "reference_id", "reference_from", "reference_to", "reference_phase",
            "pidentity", "psimilarity",
            "nframeshifts", "ngaps", "nstopcodons",
            "is_ok", "genome_exon_from", "genome_exon_to")))
    else:
        outfile_exons = None

    # peptide sequences are only needed for re-computing percent
    # identity/similarity per exon; without them those columns stay 0
    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    # global counters over all predictions
    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    # per-prediction summary header
    options.stdout.write("%s\n" % "\t".join((
        "prediction_id",
        "number", "dubious_exons",
        "boundaries_sum", "boundaries_max",
        "identical_exons", "inserted_exons", "deleted_exons",
        "inserted_introns", "deleted_introns",
        "truncated_Nterminus", "truncated_Cterminus",
        "deleted_Nexons", "deleted_Cexons",
        "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#":
            continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        # derive predicted exons from the peptide-to-genome alignment
        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        # sanity check: last exon must end at the predicted genomic end
        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken, entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            # skip predictions whose sequences are shorter than the
            # alignment claims - would cause out-of-range access below
            if query_sequence.getLength() < entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken,
                    query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken,
                    sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                # re-score the stored alignment against the actual sequences
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            # NOTE(review): the label says "identity/similarity" but the
            # "before" values are passed as (mPercentSimilarity,
            # mPercentIdentity) - possibly swapped; confirm against E.debug
            # output before relying on it.
            E.debug("prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" %
                    (str(entry.mPredictionId),
                     entry.mPercentSimilarity,
                     entry.mPercentIdentity,
                     percent_similarity,
                     percent_identity))
        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]

            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            # an exon is "good" if its pide reaches this fraction of the
            # overall prediction pide
            min_pide = entry.mPercentIdentity * options.quality_threshold_pide / 100

            # two-pointer walk over predicted exons (e) and reference
            # exons (r), classifying each pair
            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" % (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                # reference genome coordinates are made relative to the
                # first reference exon
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    # exon coordinates are nucleotide-based; /3 converts to
                    # peptide residues (Py2 integer division)
                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali, entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write("# WARNING: empty alignment %s\n" % str(
                            (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali,
                            query_sequence,
                            sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                # look-ahead exons for intron insertion/deletion detection;
                # zeros/empty list act as sentinels at the ends
                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to, next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo,
                        ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken,
                                  exon_from, exon_to, exon_phase,
                                  exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.splipping_exon_boundary apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                        ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                        ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap: reference exon lies before predicted exon
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                        exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap: predicted exon lies before reference exon
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()
                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)
                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation, xquery_from, xquery_to)
                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to, xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence))
                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali,
                                query_sequence,
                                sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                # per-exon output row for the current pair
                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    # ids are zeroed for the side that did not take part
                    # in the comparison (the no-overlap branches above)
                    if exon_to == 0:
                        this_e = 0
                    if ref_to == 0:
                        this_r = 0

                    outfile_exons.write(string.join(map(str,
                                                        (entry.mPredictionId,
                                                         this_e, exon_from, exon_to, exon_phase,
                                                         this_r, ref_from, ref_to, ref_phase,
                                                         percent_identity, percent_similarity,
                                                         nframeshifts, ngaps, nstopcodons,
                                                         is_good_exon,
                                                         exon_genome_from, exon_genome_to,
                                                         )), "\t") + "\n")

            # remaining predicted exons are C-terminal insertions
            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(string.join(map(str,
                                                        (entry.mPredictionId,
                                                         e,
                                                         exon_from, exon_to, exon_phase,
                                                         0, 0, 0, 0,
                                                         0, 0,
                                                         0, 0, 0,
                                                         1,
                                                         exon_genome_from, exon_genome_to,
                                                         )), "\t") + "\n")

            # remaining reference exons are C-terminal deletions
            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write(string.join(map(str,
                                                        (entry.mPredictionId,
                                                         0,
                                                         0, 0, 0,
                                                         r,
                                                         ref_from, ref_to, ref_phase,
                                                         0, 0,
                                                         0, 0, 0,
                                                         0,
                                                         0, 0,
                                                         )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                # NOTE(review): nframeshifts/ngaps/nstopcodons are only
                # assigned when genomic_fragment is non-empty, and
                # outfile_exons is used without a None check - this branch
                # presumably assumes --exons was given; confirm.
                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment)

                    outfile_exons.write(string.join(map(str,
                                                        (entry.mPredictionId,
                                                         this_e,
                                                         exon_from, exon_to, exon_phase,
                                                         0, 0, 0, 0,
                                                         percent_identity, percent_similarity,
                                                         nframeshifts, ngaps, nstopcodons,
                                                         1,
                                                         exon_genome_from, exon_genome_to,
                                                         )), "\t") + "\n")

        # per-prediction summary row
        options.stdout.write("\t".join(map(str,
                                           (entry.mPredictionId,
                                            exons_num_exons,
                                            dubious_exons,
                                            exons_boundaries_sum,
                                            exons_boundaries_max,
                                            nidentical_exons,
                                            ninserted_exons, ndeleted_exons,
                                            ninserted_introns, ndeleted_introns,
                                            truncated_Nterminal_exon, truncated_Cterminal_exon,
                                            ndeleted_Nexons, ndeleted_Cexons,
                                            ninserted_Nexons, ninserted_Cexons))) + "\n")
def WriteExons(token1, peptide1, cds1, transcript1, token2, peptide2, cds2, transcript2, peptide_map_a2b): if param_loglevel >= 3: for cd in cds1: print "#", str(cd) for cd in cds2: print "#", str(cd) print "# peptide_map_a2b", str( alignlib_lite.AlignmentFormatExplicit(peptide_map_a2b)) sys.stdout.flush() dna_map_a2b = Genomics.AlignmentProtein2CDNA(peptide_map_a2b, cds1, cds2) if len(cds1) != len(cds2): if param_loglevel >= 4: print "" # WARNING: different number of exons!" seq1 = alignlib_lite.makeSequence(transcript1) seq2 = alignlib_lite.makeSequence(transcript2) tmp_map_a2b = alignlib_lite.makeAlignmentVector() dialign = WrapperDialign.Dialign("-n") dialignlgs = WrapperDialign.Dialign("-n -it -thr 2 -lmax 30 -smin 8") dba = WrapperDBA.DBA() #clustal = WrapperClustal.Clustal() matrix, gop, gep = global_substitution_matrix alignator_nw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_GLOBAL, gop, gep, matrix) alignator_sw = alignlib_lite.makeAlignatorDPFullDP( alignlib_lite.ALIGNMENT_LOCAL, gop, gep, matrix) # concatenated alignments for exons: # 1: only the common parts ali_common1 = "" ali_common2 = "" e1, e2 = 0, 0 while cds1[e1].mGenomeTo <= dna_map_a2b.getRowFrom(): e1 += 1 while cds2[e2].mGenomeTo <= dna_map_a2b.getColFrom(): e2 += 1 nskipped, nerrors = 0, 0 if param_loglevel >= 5: nmapped = 0 for x in range(dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo() + 1): if dna_map_a2b.mapRowToCol(x) >= 0: nmapped += 1 print "# nmapped=", nmapped print str(alignlib_lite.AlignmentFormatEmissions(dna_map_a2b)) # declare alignments used map_intron_a2b = alignlib_lite.makeAlignmentVector() result = Exons.CompareGeneStructures(cds1, cds2, map_cmp2ref=peptide_map_a2b) if param_loglevel >= 2: print result.Pretty("#") nskipped_exons, nskipped_introns = 0, 0 last_e1, last_e2 = None, None for link in result.mEquivalences: if link.mCoverage <= param_min_exon_coverage: nskipped_exons += 1 continue e1, e2 = link.mId1, link.mId2 c1 = cds1[e1] c2 = cds2[e2] 
exon_fragment1 = transcript1[c1.mGenomeFrom:c1.mGenomeTo] exon_fragment2 = transcript2[c2.mGenomeFrom:c2.mGenomeTo] ####################################################################### # write unaligned exons if param_write_exons: pair = AlignedPairs.UnalignedPair() pair.mCategory = "exon" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) pair.mLen1 = len(exon_fragment1) pair.mSequence1 = exon_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum2 = len(cds2) pair.mLen2 = len(exon_fragment2) pair.mSequence2 = exon_fragment2 pair.mFrom1, pair.mTo1 = c1.mGenomeFrom, c1.mGenomeTo, pair.mFrom2, pair.mTo2 = c2.mGenomeFrom, c2.mGenomeTo, print str(pair) sys.stdout.flush() ####################################################################### # build alignment for overlap of both exons # tmp_map_a2b.clear() # alignlib_lite.copyAlignment( tmp_map_a2b, dna_map_a2b, # c1.mGenomeFrom + 1, c1.mGenomeTo ) # if param_loglevel >= 5: # print "# alignment: %i-%i" % (c1.mGenomeFrom + 1, c1.mGenomeTo) # for x in alignlib_lite.writeAlignmentTable( tmp_map_a2b ).split("\n"): # print "#", x # if tmp_map_a2b.getLength() == 0: # if param_loglevel >= 1: # print "# WARNING: empty alignment between exon %i (from %i to %i) and exon %i" % \ ## (e1,c1.mGenomeFrom + 1, c1.mGenomeTo, e2) # print "## peptide_map_a2b", peptide_map_a2b.getRowFrom(), peptide_map_a2b.getRowTo(),\ ## peptide_map_a2b.getColFrom(), peptide_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(peptide_map_a2b) # print "## dna_map_a2b", dna_map_a2b.getRowFrom(), dna_map_a2b.getRowTo(),\ ## dna_map_a2b.getColFrom(), dna_map_a2b.getColTo(), \ # Alignlib.writeAlignmentCompressed(dna_map_a2b) # for cd in cds1: print "##", str(cd) # for cd in cds2: print "##", str(cd) ## nerrors += 1 # continue ## data = map(lambda x: x.split("\t"), alignlib_lite.writePairAlignment( seq1, seq2, tmp_map_a2b ).split("\n")) # if "caligned" in param_write_exons : # print 
"exon\tcaligned\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, e1, ## token2, e2, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) ## ali_common1 += data[0][1] ## ali_common2 += data[1][1] ####################################################################### # write alignment of introns for orthologous introns # orthologous introns are between orthologous exons if param_write_introns: if last_e1 is not None: if e1 - last_e1 != 1 or e2 - last_e2 != 1: nskipped_introns += 1 else: pair = AlignedPairs.UnalignedPair() intron_from1 = cds1[e1 - 1].mGenomeTo intron_to1 = cds1[e1].mGenomeFrom intron_from2 = cds2[e2 - 1].mGenomeTo intron_to2 = cds2[e2].mGenomeFrom intron_fragment1 = transcript1[intron_from1:intron_to1] intron_fragment2 = transcript2[intron_from2:intron_to2] if len(intron_fragment1) == 0 or len( intron_fragment2) == 0: print "## ERROR: empty intron fragments: %i-%i out of %i and %i-%i out of %i." %\ (intron_from1, intron_to1, len(transcript1), intron_from2, intron_to2, len(transcript2)) continue pair.mCategory = "intron" pair.mToken1 = token1 pair.mId1 = e1 + 1 pair.mNum1 = len(cds1) - 1 pair.mLen1 = len(intron_fragment1) pair.mFrom1 = intron_from1 pair.mTo1 = intron_to1 pair.mSequence1 = intron_fragment1 pair.mToken2 = token2 pair.mId2 = e2 + 1 pair.mNum1 = len(cds2) - 1 pair.mLen2 = len(intron_fragment2) pair.mFrom2 = intron_from2 pair.mTo2 = intron_to2 pair.mSequence2 = intron_fragment2 if (param_min_intron_length and len(intron_fragment1) < param_min_intron_length) or \ (param_min_intron_length and len(intron_fragment2) < param_min_intron_length) or \ (param_max_intron_length and len(intron_fragment1) > param_max_intron_length) or \ (param_max_intron_length and len(intron_fragment2) > param_max_intron_length): if param_loglevel >= 1: print "# skipped: fragment lengths out of bounds for: %s\t%s\t%s\t%s\t%i\t%i" %\ (token1, e1, token2, e2, len(intron_fragment1), len(intron_fragment2)) sys.stdout.flush() nskipped += 1 
print str(pair) # else: ## anchored_from1 = intron_from1 - param_extend_introns ## anchored_to1 = intron_to1 + param_extend_introns ## anchored_from2 = intron_from2 - param_extend_introns ## anchored_to2 = intron_to2 + param_extend_introns ## anchored_fragment1 = transcript1[anchored_from1:anchored_to1] ## anchored_fragment2 = transcript2[anchored_from2:anchored_to2] # for method in param_write_introns: # if param_loglevel >= 2: # print "## aligning with method %s" % method # sys.stdout.flush # map_intron_a2b.clear() # if method == "unaligned": ## from1, to1, ali1, from2, to2, ali2 = 0, 0, intron_fragment1, 0, 0, intron_fragment2 # elif method in ("dialigned", "dbaligned", "clusaligned", "dialignedlgs"): ## tmp_intron_a2b = alignlib_lite.makeAlignmentVector() # if param_loglevel >= 1: # print "# aligning with method %s two fragments of length %i and %i" % (method, # len(anchored_fragment1), # len(anchored_fragment2)) # sys.stdout.flush() # if method == "dialigned": ## result = dialign.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dialignedlgs": ## result = dialignlgs.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "dbaligned": ## result = dba.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # elif method == "clusaligned": ## result = clustal.Align( anchored_fragment1, anchored_fragment2, tmp_intron_a2b ) # if not result or result.getLength() == 0: # if param_loglevel >= 1: # print "# Error: empty intron alignment" # sys.stdout.flush() ## nerrors += 1 # continue ## tmp_intron_a2b.moveAlignment( anchored_from1, anchored_from2 ) # alignlib_lite.copyAlignment( map_intron_a2b, tmp_intron_a2b, ## intron_from1 + 1, intron_to1, # intron_from2 + 1, intron_to2 ) # elif method == "nwaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignator_nw.Align( seq1, seq2, map_intron_a2b ) # 
seq1.useFullLength() # seq2.useFullLength() # elif method == "swaligned": ## seq1.useSegment( cds1[e1-1].mGenomeTo + 1, cds1[e1].mGenomeFrom ) ## seq2.useSegment( cds2[e2-1].mGenomeTo + 1, cds2[e2].mGenomeFrom ) ## alignlib_lite.performIterativeAlignment( map_intron_a2b, seq1, seq2, alignator_sw, param_min_score_sw ) # seq1.useFullLength() # seq2.useFullLength() # else: ## raise "unknown method %s" % method # if map_intron_a2b.getLength() > 0: # if param_compress: ## from1, to1 = map_intron_a2b.getRowFrom(), map_intron_a2b.getRowTo() ## from2, to2 = map_intron_a2b.getColFrom(), map_intron_a2b.getColTo() ## ali1, ali2 = Alignlib.writeAlignmentCompressed( map_intron_a2b ) # else: # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, map_intron_a2b ).split("\n")) # if len(data) < 2: ## data=[ ( 0, "", 0), (0, "", 0)] ## from1, ali1, to1 = data[0] ## from2, ali2, to2 = data[1] # print string.join(map(str, ("intron", # method, ## token1, e1, len(cds1) - 1, len(intron_fragment1), ## token2, e2, len(cds2) - 1, len(intron_fragment2), # map_intron_a2b.getNumGaps(), # map_intron_a2b.getLength(), ## map_intron_a2b.getLength() - map_intron_a2b.getNumGaps(), ## from1, to1, ali1, ## from2, to2, ali2, ## intron_from1, intron_to1, # intron_from2, intron_to2)), "\t") # sys.stdout.flush() last_e1, last_e2 = e1, e2 ########################################################################## # write concatenated exons # for method in param_write_exons: # if method == "common": # print "exon\tcommon\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## 0, 0, ## 0, 0, # ali_common1, ali_common2 ) # elif method == "exons": # Write full alignment without gaps. # This will not care about exon boundaries and gaps. 
# data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) # try: ## from1, s1, to1, from2, s2, to2 = data[0] + data[1] # except ValueError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # except IndexError: ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" ## nerrors += 1 # if from1: # if len(s1) != len(s2): # print "# WARNING: alignment of different lengths: %i and %i" % (len(s1), len(s2)) ## nerrors += 1 ## from1, to1, from2, to2 = 0, 0, 0, 0 ## s1, s2 = "", "" # else: ## a1, a2 = [], [] # for x in range( min(len(s1), len(s2)) ): # if s1[x] != "-" and s2[x] != "-": ## a1.append( s1[x] ) ## a2.append( s2[x] ) ## s1 = string.join(a1, "") ## s2 = string.join(a2, "") # print "exon\texons\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( (token1, 0, ## token2, 0, ## from1, to1, ## from2, to2, # s1, s2 ) ) # elif method == "full": # write full alignment (do not care about exon boundaries) # data = map(lambda x: x.split("\t"), # alignlib_lite.writePairAlignment( seq1, seq2, dna_map_a2b ).split("\n")) ## if len(data) < 2: data=[ ( 0, "", 0), (0, "", 0)] # print "exon\tfull\t%s\t%i\t%s\t%i\t%s\t%s\t%s\t%s\t%s\t%s" % ( token1, 0, ## token2, 0, ## data[0][0], data[0][2], ## data[1][0], data[1][2], # data[0][1], data[1][1] ) if param_loglevel >= 3: print "# skipped_exons=%i, skipped_introns=%i" % (nskipped_exons, nskipped_introns) return nerrors, nskipped
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser.add_option("-m", "--filename-map", dest="filename_map", type="string", help="filename with mapping information.") parser.add_option("-o", "--pattern-old", dest="pattern_old", type="string", help="pattern for mapping new to old identifiers: extract string from old.") parser.add_option("-n", "--pattern-new", dest="pattern_new", type="string", help="pattern for mapping new to old identifiers: put string into new.") parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="genome_file.") parser.add_option("-p", "--peptides", dest="filename_peptides", type = "string", help="filename with peptide sequences.") parser.add_option("-f", "--input-format", dest="input_format", type="choice", help="format of mapping file", choices=("alignment", "offsets") ) parser.add_option("-i", "--write-missed", dest="write_missed", type="string", help="write missed identifiers to separate file.") parser.add_option("-a", "--filename-genes", dest="filename_genes", type="string", help="filename with gene information.") parser.add_option("--filename-old-peptides", dest="filename_old_peptides", type="string", help="filename with old peptide information.") parser.add_option("--no-renumber", dest="renumber", action="store_false", help="do not renumber predictions.") parser.add_option("--contig-sizes-old", dest="contig_sizes_old", type="string", help="contig sizes for old data.") parser.add_option("--contig-sizes-new", dest="contig_sizes_new", type="string", help="contig sizes for new data.") parser.add_option("--skip-errors", dest="skip_errors", action="store_true", help="skip entries with errors.") parser.set_defaults( filename_map = None, pattern_old = "(.+)", pattern_new = "%s", genome_file = None, filename_peptides = None, write_missed = None, filename_genes = None, filename_old_peptides = None, renumber = True, 
input_format = "alignment", contig_sizes_old = None, contig_sizes_new = None, skip_errors = None ) (options, args) = E.Start( parser, add_pipe_options = True) predictor = PredictorExonerate() ## the different mapping criteria map_sbjcts = {} breakpoints = {} ################################################################################################ map_transcript2gene = {} if options.filename_genes: infile = open(options.filename_genes, "r") for gene, transcript in map( lambda x: x[:-1].split("\t")[:2], filter( lambda x: x[0] != "#", infile.readlines())): map_transcript2gene[transcript] = gene infile.close() ################################################################################################ peptides = {} if options.filename_peptides: peptides = Genomics.ReadPeptideSequences( open(options.filename_peptides, "r")) options.stdlog.write( "# read %i peptide sequences.\n" % len(peptides)) ################################################################################################ ## read old query sequences and compare against new query sequences ## this can be used to build a map between old and new queries query_map_old2new = {} if options.filename_old_peptides: old_peptides = Genomics.ReadPeptideSequences( open(options.filename_old_peptides, "r")) options.stdlog.write( "# read %i old peptide sequences.\n" % len(old_peptides)) query_map_old2new, unmappable, unmapped = Genomics.MapSequences( old_peptides, peptides) options.stdlog.write( "# built map: unmappable=%i unmapped=%i.\n" % (len(unmappable), len(unmapped))) if options.loglevel >= 2: options.stdlog.write( "# unmappable: %s.\n" % ";".join(unmappable)) options.stdlog.write( "# unmapped: %s.\n" % ";".join(unmapped)) ################################################################################################ ## read old/new contig sizes for mapping positive/negative coordinates contig_sizes_old = {} contig_sizes_new = {} if options.contig_sizes_old: contig_sizes_old = 
Genomics.ReadContigSizes( open(options.contig_sizes_old, "r") ) if options.contig_sizes_new: contig_sizes_new = Genomics.ReadContigSizes( open(options.contig_sizes_new, "r") ) ################################################################################################ if options.filename_map: infile = open(options.filename_map) if options.input_format == "alignments": for line in infile: if line[0] == "#": continue x, old_token, old_from, old_to, old_ali, new_from, new_to, new_ali = line[:-1].split("\t") map_sbjcts[old_token] = (old_from, old_ali, new_from, new_ali) if options.loglevel >= 1: options.stdlog.write( "# read %i alignments.\n" % len(map_sbjcts)) elif options.input_format == "offsets": ## input is a list of segments and their offsets. breakpoints, endpoints, offsets = ReadOffsets( infile ) if options.loglevel >= 1: options.stdlog.write( "# read breakpoints for %i chromosomes.\n" % len(breakpoints)) infile.close() ################################################################################################ ################################################################################################ ################################################################################################ ## end of input section ################################################################################################ ################################################################################################ ################################################################################################ rx = re.compile(options.pattern_old) last_sbjct_token = None ninput = 0 nerrors = 0 nerrors_map = 0 nerrors_inconsistencies = 0 nerrors_boundaries = 0 nerrors_translation = 0 nerrors_inconsequential = 0 nerrors_realigned = 0 nmapped = 0 nfiltered = 0 naligned = 0 noutput = 0 found_transcripts = {} nduplicates = 0 output = {} for line in sys.stdin: if line[0] == "#": continue entry = PredictionParser.PredictionParserEntry() entry.Read( line ) 
ninput += 1 is_positive = entry.mSbjctStrand == "+" is_error = False ## check if query token is mappable: using sequence map if (query_map_old2new and entry.mQueryToken not in query_map_old2new): options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) ) nfiltered += 1 continue else: ## check if query token is mappable: using filter if (peptides and entry.mQueryToken not in peptides): options.stdlog.write("# skipping prediction %i: obsolete query %s\n" % (entry.mPredictionId, entry.mQueryToken) ) nfiltered += 1 continue new_sbjct_token = options.pattern_new % rx.search(entry.mSbjctToken).groups()[0] ########################################################################################################## ## Map via alignments if entry.mSbjctToken in map_sbjcts: nmapped += 1 if last_sbjct_token != entry.mSbjctToken: old_from, old_ali, new_from, new_ali = map_sbjcts[entry.mSbjctToken] map_a2b = alignlib_lite.makeAlignmentVector() alignlib_lite.AlignmentFormatExplicit( int(old_from), old_ali, int(new_from), new_ali).copy( map_a2b ) last_sbjct_token = entry.mSbjctToken if options.loglevel >= 3: print "#", str(entry) print "#", map_sbjcts[entry.mSbjctToken] sys.stdout.flush() old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo ## convert to forward coordinates: if is_positive: f, t= old_f, old_t first_res, last_res = f + 1, t else: f, t = map_a2b.getRowTo() - old_f, map_a2b.getRowTo() - old_t first_res, last_res = f, t + 1 ## map first and last residues mfirst_res = map_a2b.mapRowToCol( first_res ) mlast_res = map_a2b.mapRowToCol( last_res ) if (mfirst_res == 0 and old_f != 0) or (mlast_res == 0 and old_t != map_a2b.getRowTo() ): options.stderr.write("# mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \ (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand, old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res, f, t)) options.stderr.write("# %s\n" 
% str(map_sbjcts[entry.mSbjctToken])) options.stderr.write("# %s\n" % str(entry)) options.stderr.flush() nerrors_boundaries += 1 is_error = True ## get extended boundaries for alignment later on while mfirst_res == 0 and first_res > 1: first_res -= 1 mfirst_res = map_a2b.mapRowToCol(first_res) while mlast_res == 0 and last_res < map_a2b.getRowTo(): last_res += 1 mlast_res = map_a2b.mapRowToCol(last_res) ## convert to genomic coordinates ## convert negative strand coordinates if is_positive: new_f = mfirst_res - 1 new_t = mlast_res else: new_f = mfirst_res new_t = mlast_res - 1 new_f = map_a2b.getColTo() - new_f new_t = map_a2b.getColTo() - new_t ## Now map the alignment. try: MapAlignment( entry, map_a2b ) except ValueError: options.stderr.write("# alignment mapping not possible for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i\n" % \ (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand, old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res, new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)) options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken])) options.stderr.flush() nerrors_map += 1 is_error= True if new_f != entry.mSbjctGenomeFrom or new_t != entry.mSbjctGenomeTo: options.stderr.write("# mapping inconsistency for prediction %i on %s %s:%i-%i -> %i-%i -> %i-%i -> %i-%i -> %i-%i <> %i-%i\n" % \ (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand, old_f, old_t, f, t, first_res, last_res, mfirst_res, mlast_res, new_f, new_t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)) nerrors_inconsistencies += 1 is_error = True ########################################################################################################## ## Map via offsets if entry.mSbjctToken in breakpoints: old_f, old_t = entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo ## convert to forward coordinates: if is_positive: f, t= old_f, old_t else: f, t = contig_sizes_old[entry.mSbjctToken] - old_t, 
contig_sizes_old[entry.mSbjctToken] - old_f o1 = GetOffset( f, breakpoints[entry.mSbjctToken], endpoints[entry.mSbjctToken], offsets[entry.mSbjctToken] ) o2 = GetOffset( t, breakpoints[entry.mSbjctToken], endpoints[entry.mSbjctToken], offsets[entry.mSbjctToken] ) if o1 != o2: options.stderr.write("# break within gene %s\n" % str(entry)) nerrors_map += 1 is_error = True f += o1 t += o2 if not is_positive: f, t = contig_sizes_new[entry.mSbjctToken] - t, contig_sizes_new[entry.mSbjctToken] - f entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo = f, t if entry.mSbjctGenomeFrom > entry.mSbjctGenomeTo: options.stderr.write("# mapping error: start after end %s\n" % str(entry)) nerrors_map += 1 is_error = True ########################################################################################################## ## do translation check, if genome is given if options.genome_file: genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo, options.genome_file, loglevel = 0) map_peptide2translation, translation = Genomics.Alignment2PeptideAlignment( \ entry.mMapPeptide2Genome, entry.mQueryFrom, 0, genomic_sequence ) if re.sub("X", "", translation) != re.sub("X", "", entry.mTranslation): options.stderr.write("# translation error for prediction %i on %s %s:%i-%i -> %i-%i <> %i-%i\n" % \ (entry.mPredictionId, entry.mSbjctToken, entry.mSbjctStrand, old_f, old_t, f, t, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)) if map_sbjcts: options.stderr.write("# %s\n" % str(map_sbjcts[entry.mSbjctToken])) options.stderr.write("# old=%s\n# new=%s\n" % (entry.mTranslation, translation)) options.stderr.write("# old=%s\n# new=%s\n" % (entry.mAlignmentString, Genomics.Alignment2String(entry.mMapPeptide2Genome))) nerrors_translation += 1 is_error = True if peptides and entry.mQueryToken in peptides: naligned += 1 options.stdlog.write( "# aligning: %s versus %s:%s: %i-%i\n" % ( \ entry.mQueryToken, new_sbjct_token, 
entry.mSbjctStrand, entry.mSbjctGenomeFrom, entry.mSbjctGenomeTo)) # do a quick reprediction if entry.mQueryToken in peptides: genomic_sequence = Genomics.GetGenomicSequence( new_sbjct_token, entry.mSbjctStrand, 0, 0, genome_file = options.genome_pattern, loglevel = 0) predictor.mLogLevel = 0 result = predictor(entry.mQueryToken, peptides[entry.mQueryToken], entry.mSbjctToken, genomic_sequence, "--exhaustive --subopt FALSE --score '%s' " % str(80), new_f - 10, new_t + 10) prediction_id = entry.mPredictionId if result: entry = result[0] entry.mPredictionId = prediction_id nerrors_realigned += 1 else: if is_error: nerrors_inconsequential += 1 entry.mSbjctToken = new_sbjct_token ## map query tokens if query_map_old2new: query_tokens = query_map_old2new[entry.mQueryToken] else: query_tokens = (entry.mQueryToken,) if options.skip_errors and is_error: continue for query_token in query_tokens: entry.mQueryToken = query_token prediction_id = entry.mPredictionId entry.mPredictionId = 0 hid = Genomics.GetHID( str(entry) ) if hid in output: nduplicates += 1 continue noutput += 1 if options.renumber: prediction_id = noutput entry.mPredictionId = prediction_id options.stdout.write( str(entry) + "\n") options.stdout.flush() found_transcripts[entry.mQueryToken] = 1 ## write out found transcripts and genes nmissed_transcripts = 0 missed_transcripts = [] found_genes = {} if peptides: for x in peptides.keys(): if x not in found_transcripts: nmissed_transcripts += 1 missed_transcripts.append( x ) else: found_genes[map_transcript2gene[x]] = 1 missed_genes = {} nmissed_genes = 0 if map_transcript2gene: for t in missed_transcripts: g = map_transcript2gene[t] if g not in found_genes: missed_genes[g] = 1 nmissed_genes = len(missed_genes) if options.write_missed: outfile = open(options.write_missed, "w") for x in missed_transcripts: if x in unmapped: status = "unmapped" else: status = "mapped" outfile.write( "%s\t%s\t%s\n" % ("transcript", x, status )) for x in missed_genes: status = 
"unknown" outfile.write( "%s\t%s\t%s\n" % ("gene", x, status )) outfile.close() options.stdlog.write("# input=%i, output=%i, filtered=%i, nduplicates=%i, mapped=%i, errors=%i\n" % (\ ninput, noutput, nfiltered, nduplicates, nmapped, nerrors )) options.stdlog.write("# errors: inconsequental=%i, boundaries=%i, mapping=%i, inconsistencies=%i, translation=%i, realigned=%i\n" % (\ nerrors_inconsequential, nerrors_boundaries, nerrors_map, nerrors_inconsistencies, nerrors_translation, nerrors_realigned )) options.stdlog.write("# peptides: input=%i, found=%i, missed=%i, found_genes=%i, missed_genes=%i\n" % (\ len(peptides), len(found_transcripts), nmissed_transcripts, len(found_genes), nmissed_genes) ) E.Stop()
def main( argv = None ): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv == None: argv = sys.argv parser = E.OptionParser( version = "%prog version: $Id: gpipe/prediction2pairs.py 2031 2008-07-15 09:19:05Z andreas $", usage = globals()["__doc__"]) parser.add_option( "-g", "--genome-file", dest="genome_file", type="string", help="filename with genomic data (indexed)." ) parser.add_option( "-c", "--cds", dest="filename_cds", type="string", help="filename with cds seguences." ) parser.add_option( "-f", "--format", dest="format", type="choice", choices=("paired_fasta", ), help="output format, valid options are: paired_fasta: concatenated pairwise alignments in FASTA format" ) parser.set_defaults( genome_file = "genome", filename_cds = "cds.fasta", format = "paired_fasta", filename_suffix = ".fasta", filename_prefix = "", ) (options, args) = E.Start( parser, add_psql_options = True ) if len(args) > 0: print USAGE, "no arguments required." sys.exit(1) fasta = IndexedFasta.IndexedFasta( options.genome_file ) ## reading CDS sequences if options.filename_cds: cds_sequences = Genomics.ReadPeptideSequences( open(options.filename_cds, "r") ) else: cds_sequences = {} if options.loglevel >= 1: options.stdlog.write( "# read %i CDS sequences\n" % len(cds_sequences) ) last_filename_genome = None p = PredictionParser.PredictionParserEntry() ninput, noutput, nsanity, n3, nlength = 0, 0, 0, 0, 0 for line in options.stdin: if line[0] == "#": continue if line[0] == '"': continue p.Read(line) ninput += 1 genomic_fragment = fasta.getSequence( p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo ) if len(genomic_fragment) == 0: raise "ERROR: empty fragment %s:%s for line" % (p.mSbjctGenomeFrom, p.mSbjctGenomeTo), line try: cds_fragment = cds_sequences[p.mQueryToken] except KeyError: options.stdlog.write( "# ERROR: cds not found: query %s.\n" % p.mQueryToken ) continue map_query2sbjct, genomic_fragment = Genomics.Alignment2CDNA( 
p.mMapPeptide2Genome, query_from = p.mQueryFrom, sbjct_from = 0, genome = genomic_fragment ) ## check for errors: if map_query2sbjct.getRowTo() != p.mQueryTo * 3: options.stdlog.write( "# ERROR: boundary shift in query at line %s\n# %i %i\n" % (line, map_query2sbjct.getRowTo(), p.mQueryTo * 3 ) ) if map_query2sbjct.getColTo() > len(genomic_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# genomic fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(genomic_fragment), map_query2sbjct.getColTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue if map_query2sbjct.getRowTo() > len(cds_fragment): options.stdlog.write( "# ERROR: length mismatch in line %s\n# cds fragment (%i) shorter than last aligned residue (%i)\n" %\ (line, len(cds_fragment), map_query2sbjct.getRowTo()) ) options.stdlog.write( "# cds %s\n# genomic %s\n" % (str( cds_fragment ), genomic_fragment )) nlength += 1 continue cds_seq = alignlib_lite.makeSequence( cds_fragment ) genomic_seq = alignlib_lite.makeSequence( genomic_fragment ) f = alignlib_lite.AlignmentFormatExplicit( map_query2sbjct, cds_seq, genomic_seq ) row_ali = f.mRowAlignment col_ali = f.mColAlignment row_ali, col_ali = Genomics.RemoveFrameShiftsFromAlignment(row_ali, col_ali) row_ali = Genomics.MaskStopCodons( row_ali ) col_ali = Genomics.MaskStopCodons( col_ali ) if len(row_ali) != len(col_ali): options.stdlog.write( "# ERROR: wrong alignment lengths.\n" ) sys.exit(1) if len(row_ali) % 3 or len(col_ali) % 3: options.stdlog.write( "# ERROR: sequences are not a multiple of 3 in line: %s\n" % line ) options.stdlog.write( "# %6i %s\n# %6i %s\n" % (len(row_ali), str(row_ali), len(col_ali), str(col_ali) ) ) n3 += 1 input = re.sub( "[-X]", "", p.mTranslation ) ref = re.sub( "[-X]", "", Genomics.TranslateDNA2Protein( col_ali ) ) if input != ref: if options.loglevel >= 1: options.stdlog.write("# sanity check failed for %s - %s\n# %6i %s\n# 
%6i %s\n" % (p.mPredictionId, p.mQueryToken, len(input), input, len(ref), ref ) ) nsanity += 1 continue options.stdout.write( ">%s\n%s\n" % (p.mPredictionId, row_ali) ) options.stdout.write( ">%s_vs_%s_%s_%i_%i\n%s\n" % \ (p.mQueryToken, p.mSbjctToken, p.mSbjctStrand, p.mSbjctGenomeFrom, p.mSbjctGenomeTo, col_ali) ) noutput += 1 if options.loglevel >= 1: options.stdlog.write("# ninput=%i, noutput=%i, nsanity=%i, nlength=%i, n3=%i\n" % (ninput, noutput, nsanity, nlength, n3) ) E.Stop()
def EliminateRedundantEntries( rep,
                               data,
                               eliminated_predictions,
                               options,
                               peptides,
                               extended_peptides,
                               filter_quality = None,
                               this_quality = None ):
    """eliminate redundant entries in a set.

    Compares every member of *data* against the representative *rep* and
    eliminates members that are identical ("i"), a substring/prefix of the
    representative ("p"), highly identical by local alignment ("h"), or
    redundant non-gene predictions ("l").

    rep: representative prediction (provides transcript_id, mQueryCoverage, mPid)
    data: candidate member predictions to test against rep
    eliminated_predictions: dict updated in place, member id -> representative id
    peptides / extended_peptides: id -> sequence lookups
    filter_quality: accepted but unused in this function
    this_quality: quality class of rep; controls the alignment-based rules

    returns a list of (member_id, code) tuples for eliminated members.
    """

    eliminated = []

    rep_id = rep.transcript_id
    rep_coverage, rep_pid = rep.mQueryCoverage, rep.mPid

    ## local (Smith-Waterman style) dynamic-programming alignator
    alignator = alignlib_lite.makeAlignatorDPFull( alignlib_lite.ALIGNMENT_LOCAL,
                                                   options.gop, options.gep )
    result = alignlib_lite.makeAlignmentVector()

    rep_seq = peptides[rep_id]
    rep_extended_seq = extended_peptides[rep_id]

    for entry in data:

        mem_id, mem_coverage, mem_pid, mem_quality = ( entry.transcript_id,
                                                       entry.mQueryCoverage,
                                                       entry.mPid,
                                                       entry.mQuality )

        mem_seq = peptides[mem_id]
        mem_extended_seq = extended_peptides[mem_id]

        if options.loglevel >= 4:
            options.stdlog.write( "# processing: id=%s class=%s\n" % (mem_id, mem_quality))

        ## already eliminated in an earlier round: skip
        if mem_id in eliminated_predictions: continue

        ## code "i": member is identical to the representative
        if mem_extended_seq == rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "i") )

        ## code "p": member is a substring of the representative
        elif mem_extended_seq in rep_extended_seq:
            eliminated_predictions[mem_id] = rep_id
            eliminated.append( (mem_id, "p") )

        else:
            ## alignment-based elimination is only attempted across quality
            ## classes, or within classes listed in quality_exclude_same
            if mem_quality != this_quality or \
                   mem_quality in options.quality_exclude_same:

                seq1 = alignlib_lite.makeSequence( str(rep_seq) )
                seq2 = alignlib_lite.makeSequence( str(mem_seq) )

                alignator.align( result, seq1, seq2 )

                if options.loglevel >= 5:
                    options.stdlog.write( "# ali\n%s\n" % alignlib_lite.AlignmentFormatExplicit( result, seq1, seq2 ) )

                pidentity = 100 * alignlib_lite.calculatePercentIdentity( result, seq1, seq2 )

                num_gaps = result.getNumGaps()

                if options.loglevel >= 4:
                    options.stdlog.write( "# processing: id=%s class=%s pid=%5.2f rep_cov=%i mem_cov=%i\n" %\
                                          ( mem_id, mem_quality, pidentity, rep_coverage, mem_coverage ) )

                ## code "h": highly identical member; kept (with a warning)
                ## only if one of three safety rules suggests it might be the
                ## better prediction
                if pidentity >= options.min_identity:

                    keep = False
                    ## rule 1: member has clearly better coverage or pid
                    if rep_coverage < mem_coverage - options.safety_coverage or \
                           rep_pid < mem_pid - options.safety_pide:
                        keep = True
                        reason = "covpid"
                    ## rule 2: gappy alignment and comparable member coverage
                    elif num_gaps >= options.max_gaps and \
                             mem_coverage > rep_coverage - options.safety_coverage:
                        keep = True
                        reason = "gaps"
                    ## rule 3: comparable coverage but the alignment covers
                    ## only a small fraction of the member
                    ## NOTE(review): "/" is integer division under Python 2
                    ## when both operands are ints -- confirm intended.
                    elif mem_coverage >= rep_coverage - options.safety_coverage and \
                             100 * (result.getColTo() - result.getColFrom()) / len(mem_seq) < options.max_member_coverage:
                        keep = True
                        reason = "memcov"

                    if keep:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: %s: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                              (reason, rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append( (mem_id, "h") )

                ## code "l": weaker identity, but rep is a gene and the member
                ## is not -- eliminate unless the member looks better
                elif pidentity >= options.min_identity_non_genes and \
                         this_quality in options.quality_genes and \
                         mem_quality not in options.quality_genes:

                    if rep_coverage < mem_coverage - options.safety_coverage or \
                           rep_pid < mem_pid - options.safety_pide:
                        options.stdlog.write( "# WARNING: not removing possibly good prediction: rep = %s, mem = %s, rep_cov=%i, rep_pid=%i, mem_cov=%i, mem_pid=%i\n" %\
                                              (rep_id, mem_id, rep_coverage, rep_pid, mem_coverage, mem_pid) )
                    else:
                        eliminated_predictions[mem_id] = rep_id
                        eliminated.append( (mem_id, "l") )

    return eliminated