def ClusterByExonCorrespondence(lengths=None, peptide_sequences=None):

    # use None instead of a mutable default argument so that repeated
    # calls do not silently share the same length dictionary
    if lengths is None:
        lengths = {}

    exons = Exons.ReadExonBoundaries(sys.stdin)
    if param_loglevel >= 1:
        print "# read exons for %i transcripts" % len(exons)

    if not lengths:
        for k in exons:
            lengths[k] = (exons[k][0].mPeptideTo / 3) + 1
            for e in exons[k][1:]:
                lengths[k] = max(lengths[k], (e.mPeptideTo / 3) + 1)

        if param_loglevel >= 1:
            print "# lengths for %i transcripts" % len(lengths)

    map_region2transcript = {}
    map_transcript2region = {}
    map_transcript2transcript = {}

    ## build map of regions to transcripts
    for t in exons:
        map_transcript2region[t] = []
        for e in exons[t]:
            r = "%s-%s-%i-%i" % (e.mSbjctToken, e.mSbjctStrand,
                                 e.mGenomeFrom, e.mGenomeTo)
            if r not in map_region2transcript:
                map_region2transcript[r] = []
            map_region2transcript[r].append(t)
            map_transcript2region[t].append(r)

    ## build map of transcript to transcript
    map_transcript2transcript = {}
    for t in map_transcript2region:
        map_transcript2transcript[t] = []
        for r in map_transcript2region[t]:
            for tt in map_region2transcript[r]:
                map_transcript2transcript[t].append(tt)

    ## remove self-matches and collapse duplicates from the sorted lists
    for t in map_transcript2transcript:
        map_transcript2transcript[t].sort()
        l = None
        n = []
        for tt in map_transcript2transcript[t]:
            if t == tt:
                continue
            if l != tt:
                n.append(tt)
            l = tt
        map_transcript2transcript[t] = n

    ## cluster greedily, take longest transcript
    cluster_id = 1
    for t in map_transcript2region:
        if t not in map_transcript2transcript:
            continue
        cluster = CollectCluster(map_transcript2transcript, t)
        PrintCluster(cluster, cluster_id, lengths, peptide_sequences,
                     param_regex_preferred)
        cluster_id += 1

    if param_loglevel >= 1:
        print "# RESULT: %i transcripts in %i genes" % (
            len(map_transcript2region), cluster_id - 1)
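# A minimal, self-contained sketch of the clustering idea used above:
# transcripts become linked whenever they share an identical genomic exon
# region ("contig-strand-start-end" key), and linked transcripts are grouped
# transitively. The exon tuples and the flood fill below are illustrative
# stand-ins, not the CGAT data structures or CollectCluster itself.
def cluster_by_shared_exons(exons):
    """exons: dict of transcript -> list of (contig, strand, start, end)."""
    region2transcripts = {}
    for t, ee in exons.items():
        for contig, strand, start, end in ee:
            key = "%s-%s-%i-%i" % (contig, strand, start, end)
            region2transcripts.setdefault(key, []).append(t)

    neighbours = dict((t, set()) for t in exons)
    for members in region2transcripts.values():
        for t in members:
            neighbours[t].update(m for m in members if m != t)

    clusters, seen = [], set()
    for t in exons:
        if t in seen:
            continue
        stack, cluster = [t], set()
        while stack:                      # transitive closure via flood fill
            c = stack.pop()
            if c in cluster:
                continue
            cluster.add(c)
            stack.extend(neighbours[c] - cluster)
        seen.update(cluster)
        clusters.append(sorted(cluster))
    return clusters

# e.g. cluster_by_shared_exons({"t1": [("chr1", "+", 0, 100)],
#                               "t2": [("chr1", "+", 0, 100)],
#                               "t3": [("chr2", "+", 0, 50)]})
# -> [["t1", "t2"], ["t3"]] (cluster order depends on dict iteration order)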
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/exons2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method",
                      help="method to apply.",
                      type="choice",
                      choices=("remove-stop", ))
    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="work in forward coordinates.")

    parser.set_defaults(method=None,
                        forward_coordinates=False,
                        genome_file=None)

    (options, args) = E.Start(parser)

    if options.method == "remove-stop" and not options.genome_file:
        raise ValueError("please supply genome file for method %s" %
                         options.method)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
        exons = Exons.ReadExonBoundaries(sys.stdin, contig_sizes=contig_sizes)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    ninput, noutput, nremoved_stops, nremoved_exons = 0, 0, 0, 0

    for id, ee in exons.items():

        ninput += 1

        if options.loglevel >= 3:
            for e in ee:
                options.stdlog.write("# %s\n" % str(e))

        if options.method == "remove-stop":
            e = ee[-1]
            d = min(3, e.mPeptideTo - e.mPeptideFrom)
            if d < 3:
                ## the terminal codon is split across the last two exons
                codon2 = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                           e.mGenomeTo - d, e.mGenomeTo)
                prev_e = ee[-2]
                codon1 = fasta.getSequence(prev_e.mSbjctToken,
                                           prev_e.mSbjctStrand,
                                           prev_e.mGenomeTo - (3 - d),
                                           prev_e.mGenomeTo)
                codon = codon1 + codon2
            else:
                codon = fasta.getSequence(e.mSbjctToken, e.mSbjctStrand,
                                          e.mGenomeTo - d, e.mGenomeTo)

            if codon.upper() in Genomics.StopCodons:
                if d < 3:
                    nremoved_exons += 1
                    d = 3 - d
                    del ee[-1]
                    e = ee[-1]
                e.mGenomeTo -= d
                e.mPeptideTo -= d
                nremoved_stops += 1
                if e.mGenomeTo == e.mGenomeFrom:
                    nremoved_exons += 1
                    del ee[-1]
                    e = ee[-1]

            assert (e.mGenomeTo > e.mGenomeFrom)
            assert (e.mPeptideTo > e.mPeptideFrom)

        if options.forward_coordinates:
            l = contig_sizes[ee[0].mSbjctToken]
            for e in ee:
                e.InvertGenomicCoordinates(l)

        for e in ee:
            options.stdout.write(str(e) + "\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nremoved_stops=%i, nremoved_exons=%i\n" %
            (ninput, noutput, nremoved_stops, nremoved_exons))

    E.Stop()
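# A hedged sketch of the "remove-stop" bookkeeping above: the last codon of
# a CDS may be split across the final two exons, in which case removing a
# terminal stop shortens both of them. Exons are plain (start, end) tuples on
# the spliced CDS here; the real script works on Exons objects, genomic
# strands and an indexed genome, and the stop-codon set is an assumption.
STOP_CODONS = ("TAG", "TAA", "TGA")

def trim_trailing_stop(exon_bounds, cds):
    """exon_bounds: list of (start, end) on the spliced CDS; cds: its sequence."""
    if len(cds) < 3 or cds[-3:].upper() not in STOP_CODONS:
        return exon_bounds
    remaining = 3
    trimmed = list(exon_bounds)
    while remaining and trimmed:
        start, end = trimmed[-1]
        cut = min(remaining, end - start)
        remaining -= cut
        if end - cut == start:        # exon completely consumed by the stop
            trimmed.pop()
        else:
            trimmed[-1] = (start, end - cut)
    return trimmed

# trim_trailing_stop([(0, 4), (4, 6)], "ATGTAA") -> [(0, 3)]
# (one base of the stop sits in the last exon, two in the one before it)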
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: links2fasta.py 2446 2009-01-27 16:32:35Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-s", "--sequences", dest="filename_sequences",
                      type="string",
                      help="peptide sequence [Default=%default]")
    parser.add_option("-f", "--format", dest="format",
                      type="string",
                      help="output format [Default=%default]")
    parser.add_option("-e", "--expand", dest="expand",
                      action="store_true",
                      help="expand positions from peptide to nucleotide alignment [Default=%default]")
    parser.add_option("-m", "--map", dest="filename_map",
                      type="string",
                      help="map alignments [Default=%default]")
    parser.add_option("-c", "--codons", dest="require_codons",
                      action="store_true",
                      help="require codons [Default=%default]")
    parser.add_option("--one-based-coordinates", dest="one_based_coordinates",
                      action="store_true",
                      help="expect one-based coordinates. The default are zero based coordinates [Default=%default].")
    parser.add_option("--no-identical", dest="no_identical",
                      action="store_true",
                      help="do not output identical pairs [Default=%default]")
    parser.add_option("-g", "--no-gaps", dest="no_gaps",
                      action="store_true",
                      help="remove all gaps from aligned sequences [Default=%default]")
    parser.add_option("-x", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exon boundaries [Default=%default]")
    parser.add_option("-o", "--outfile", dest="filename_outfile",
                      type="string",
                      help="filename to save links [Default=%default]")
    parser.add_option("--min-length", dest="min_length",
                      type="int",
                      help="minimum length of alignment [Default=%default]")
    parser.add_option("--filter", dest="filename_filter",
                      type="string",
                      help="given a set of previous alignments, only write new pairs [Default=%default].")

    parser.set_defaults(filename_sequences=None,
                        filename_exons=None,
                        filename_map=None,
                        filename_outfile=None,
                        no_gaps=False,
                        format="fasta",
                        expand=False,
                        require_codons=False,
                        no_identical=False,
                        min_length=0,
                        report_step=100,
                        one_based_coordinates=False,
                        filename_filter=None)

    (options, args) = E.Start(parser, add_mysql_options=True)

    t0 = time.time()
    if options.filename_sequences:
        sequences = Genomics.ReadPeptideSequences(
            open(options.filename_sequences, "r"))
    else:
        sequences = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i sequences\n" % len(sequences))
        sys.stdout.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"))
    else:
        exons = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i exons\n" % len(exons))
        sys.stdout.flush()

    if options.filename_map:
        map_old2new = {}
        for line in open(options.filename_map, "r"):
            if line[0] == "#":
                continue
            m = Map()
            m.read(line)
            map_old2new[m.mToken] = m
    else:
        map_old2new = {}

    if options.loglevel >= 1:
        options.stdlog.write("# read %i maps\n" % len(map_old2new))
        sys.stdout.flush()

    if options.filename_filter:
        if options.loglevel >= 1:
            options.stdlog.write("# reading filtering information.\n")
            sys.stdout.flush()

        map_pair2hids = {}

        if os.path.exists(options.filename_filter):
            infile = open(options.filename_filter, "r")
            iterator = FastaIterator.FastaIterator(infile)
            while 1:
                cur_record = iterator.next()
                if cur_record is None:
                    break
                record1 = cur_record
                cur_record = iterator.next()
                if cur_record is None:
                    break
                record2 = cur_record

                identifier1 = re.match("(\S+)", record1.title).groups()[0]
                identifier2 = re.match("(\S+)", record2.title).groups()[0]
                id = "%s-%s" % (identifier1, identifier2)
                s = Genomics.GetHID(record1.sequence + ";" + record2.sequence)
                if id not in map_pair2hids:
                    map_pair2hids[id] = []
                map_pair2hids[id].append(s)
            infile.close()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# read filtering information for %i pairs.\n" %
                len(map_pair2hids))
            sys.stdout.flush()
    else:
        map_pair2hids = None

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (time.time() - t0))

    if options.filename_outfile:
        outfile = open(options.filename_outfile, "w")
    else:
        outfile = None

    map_row2col = alignlib_lite.py_makeAlignmentVector()
    tmp1_map_row2col = alignlib_lite.py_makeAlignmentVector()
    counts = {}
    iterations = 0

    t1 = time.time()
    ninput, nskipped, noutput = 0, 0, 0

    for link in BlastAlignments.iterator_links(sys.stdin):

        iterations += 1
        ninput += 1

        if options.loglevel >= 1:
            if (iterations % options.report_step == 0):
                options.stdlog.write("# iterations: %i in %i seconds.\n" %
                                     (iterations, time.time() - t1))
                sys.stdout.flush()

        if link.mQueryToken not in sequences or \
           link.mSbjctToken not in sequences:
            nskipped += 1
            continue

        if options.loglevel >= 3:
            options.stdlog.write("# read link %s\n" % str(link))

        row_seq = alignlib_lite.py_makeSequence(sequences[link.mQueryToken])
        col_seq = alignlib_lite.py_makeSequence(sequences[link.mSbjctToken])

        if options.one_based_coordinates:
            link.mQueryFrom -= 1
            link.mSbjctFrom -= 1

        if options.expand:
            link.mQueryFrom = link.mQueryFrom * 3
            link.mSbjctFrom = link.mSbjctFrom * 3
            link.mQueryAli = ScaleAlignment(link.mQueryAli, 3)
            link.mSbjctAli = ScaleAlignment(link.mSbjctAli, 3)

        map_row2col.clear()

        alignlib_lite.py_AlignmentFormatEmissions(
            link.mQueryFrom, link.mQueryAli,
            link.mSbjctFrom, link.mSbjctAli).copy(map_row2col)

        if link.mQueryToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mQueryToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in row with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mQueryToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_old2new[link.mQueryToken].mMapOld2New,
                map_row2col,
                alignlib_lite.py_RR)
            map_old2new[link.mQueryToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        if link.mSbjctToken in map_old2new:
            tmp1_map_row2col.clear()
            map_old2new[link.mSbjctToken].expand()
            if options.loglevel >= 3:
                options.stdlog.write("# combining in col with %s\n" % str(
                    alignlib_lite.py_AlignmentFormatEmissions(
                        map_old2new[link.mSbjctToken].mMapOld2New)))

            alignlib_lite.py_combineAlignment(
                tmp1_map_row2col,
                map_row2col,
                map_old2new[link.mSbjctToken].mMapOld2New,
                alignlib_lite.py_CR)
            map_old2new[link.mSbjctToken].clear()
            alignlib_lite.py_copyAlignment(map_row2col, tmp1_map_row2col)

        dr = row_seq.getLength() - map_row2col.getRowTo()
        dc = col_seq.getLength() - map_row2col.getColTo()

        if dr < 0 or dc < 0:
            raise ValueError(
                "out of bounds alignment: %s-%s: alignment out of bounds. row=%i col=%i ali=%s"
                % (link.mQueryToken, link.mSbjctToken,
                   row_seq.getLength(), col_seq.getLength(),
                   str(alignlib_lite.py_AlignmentFormatEmissions(map_row2col))))

        if options.loglevel >= 2:
            options.stdlog.write(
                str(alignlib_lite.py_AlignmentFormatExplicit(
                    map_row2col, row_seq, col_seq)) + "\n")

        # check for incomplete codons
        if options.require_codons:

            naligned = map_row2col.getNumAligned()

            # turned off, while fixing alignlib_lite
            if naligned % 3 != 0:
                options.stdlog.write("# %s\n" % str(map_row2col))
                options.stdlog.write("# %s\n" % str(link))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mQueryToken]))
                options.stdlog.write("# %s\n" %
                                     str(map_old2new[link.mSbjctToken]))
                options.stdlog.write(
                    "#\n%s\n" % alignlib_lite.py_AlignmentFormatExplicit(
                        map_row2col, row_seq, col_seq))
                raise ValueError("incomplete codons %i in pair %s - %s" %
                                 (naligned, link.mQueryToken, link.mSbjctToken))

        # if so desired, write on a per exon level:
        if exons:
            if link.mQueryToken not in exons:
                raise IndexError("%s not found in exons" % (link.mQueryToken))
            if link.mSbjctToken not in exons:
                raise IndexError("%s not found in exons" % (link.mSbjctToken))
            exons1 = exons[link.mQueryToken]
            exons2 = exons[link.mSbjctToken]

            # Get overlapping segments
            segments = Exons.MatchExons(map_row2col, exons1, exons2)

            for a, b in segments:
                tmp1_map_row2col.clear()

                # make sure you got codon boundaries. Note that frameshifts
                # in previous exons will cause the codons to start at positions
                # different from mod 3. The problem is that I don't know where
                # the frameshifts occur exactly. The exon boundaries are given
                # with respect to the cds, which include the frame shifts.
                # Unfortunately, phase information seems to be incomplete in
                # the input files.
                from1, to1 = GetAdjustedBoundaries(a, exons1)
                from2, to2 = GetAdjustedBoundaries(b, exons2)

                alignlib_lite.py_copyAlignment(tmp1_map_row2col, map_row2col,
                                               from1 + 1, to1,
                                               from2 + 1, to2)

                mode = Write(tmp1_map_row2col, row_seq, col_seq, link,
                             no_gaps=options.no_gaps,
                             no_identical=options.no_identical,
                             min_length=options.min_length,
                             suffix1="_%s" % str(a),
                             suffix2="_%s" % str(b),
                             outfile=outfile,
                             pair_filter=map_pair2hids,
                             format=options.format)

                if mode not in counts:
                    counts[mode] = 0
                counts[mode] += 1

        else:
            mode = Write(map_row2col, row_seq, col_seq, link,
                         min_length=options.min_length,
                         no_gaps=options.no_gaps,
                         no_identical=options.no_identical,
                         outfile=outfile,
                         pair_filter=map_pair2hids,
                         format=options.format)

            if mode not in counts:
                counts[mode] = 0
            counts[mode] += 1

        noutput += 1

    if outfile:
        outfile.close()

    if options.loglevel >= 1:
        options.stdlog.write("# %s\n" % ", ".join(
            map(lambda x, y: "%s=%i" % (x, y),
                counts.keys(), counts.values())))
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
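# The --filter logic above keys previously written pairs by a hash of the two
# aligned sequences. A minimal sketch of that idea, using hashlib instead of
# the CGAT Genomics.GetHID helper (whose exact digest scheme is not shown in
# this excerpt):
def pair_key(id1, id2):
    return "%s-%s" % (id1, id2)

def pair_hid(seq1, seq2):
    import hashlib
    # assumption: any stable digest works for "have I written this pair before"
    return hashlib.md5((seq1 + ";" + seq2).encode("ascii")).hexdigest()

def is_new_pair(map_pair2hids, id1, id2, seq1, seq2):
    """True if this exact pair of sequences has not been written before."""
    hids = map_pair2hids.get(pair_key(id1, id2), ())
    return pair_hid(seq1, seq2) not in hids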
        if line[0] == "#":
            continue
        if line[0] == ">":
            continue

        a, b = line[:-1].split("\t")[:2]
        if b not in components:
            components[b] = []
        components[b].append(a)

    if param_loglevel >= 1:
        print "# read %i components." % len(components)

else:
    components = {'all': all_identifiers}

if param_filename_exons:
    exons = Exons.ReadExonBoundaries(open(param_filename_exons, "r"),
                                     filter=all_mali)
    if param_loglevel >= 2:
        print "# read %i exons." % len(exons)
else:
    exons = {}

print "# PREFIX\tsummary\tNSEQUENCES\tNASSIGNED\tNCLUSTERS\tNASSIGNED\tUNASSIGNED"
print "# PREFIX\tcluster\tNMEMBERS\tMEMBERS"
print "# PREFIX\tfragments\tNFRAGMENTS\tFRAGMENTS"
print "# PREFIX\tpide\tNPAIRS\tNAMIN\tNAMAX\tNAMEAN\tNAMEDIAN\tNASTDDEV\tAAMIN\tAAMAX\tAAMEAN\tAAMEDIAN\tAASTDDEV"
print string.join(("# PREFIX", "codons",
                   "NCLEAN", "NNOSTOPS",
                   "ALIGNED_MIN", "ALIGNED_MAX", "ALIGNED_MEAN",
                   "ALIGNED_MEDIAN", "ALIGNED_STDDEV",
                   "CODONS_MIN", "CODONS_MAX", "CODONS_MEAN",
                   "CODONS_MEDIAN", "CODONS_STDDEV",
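# A small sketch of the component map being built above: a two-column,
# tab-separated stream of (member, component) pairs becomes a dictionary of
# component -> members, with comment and FASTA-header lines skipped.
def read_components(lines):
    components = {}
    for line in lines:
        if not line or line[0] in "#>":
            continue
        fields = line.rstrip("\n").split("\t")[:2]
        if len(fields) < 2:
            continue
        member, component = fields
        components.setdefault(component, []).append(member)
    return components

# read_components(["a1\tc1\n", "a2\tc1\n", "b1\tc2\n"])
# -> {'c1': ['a1', 'a2'], 'c2': ['b1']}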
    elif o == "--report-step":
        param_report_step = int(a)

if len(args) > 0:
    print USAGE, "no arguments required."
    sys.exit(2)

print E.GetHeader()
print E.GetParams()
sys.stdout.flush()

if param_loglevel >= 1:
    print "# reading exon boundaries."
    sys.stdout.flush()

cds = Exons.ReadExonBoundaries(open(param_filename_cds, "r"))

if param_loglevel >= 1:
    print "# read %i cds" % (len(cds))
    sys.stdout.flush()

ninput, npairs, nskipped = 0, 0, 0

for line in sys.stdin:
    if line[0] == "#":
        continue
    if line[0] == ">":
        print line[:-1]
        continue

    ninput += 1
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/compare_predictions2exons.py 2011 2008-07-04 10:40:51Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")
    parser.add_option("-b", "--boundaries", dest="filename_boundaries",
                      type="string",
                      help="filename with exon boundaries.")
    parser.add_option("-e", "--exons", dest="filename_exons",
                      type="string",
                      help="filename with exons (output).")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide sequences.")
    parser.add_option("-w", "--write-notfound", dest="write_notfound",
                      action="store_true",
                      help="print exons for predictions not found in reference.")
    parser.add_option("-q", "--quality-pide", dest="quality_threshold_pide",
                      type="int",
                      help="quality threshold (pide) for exons.")

    parser.set_defaults(
        genome_file="genome",
        filename_boundaries=None,
        filename_exons=None,
        filename_peptides=None,
        quality_threshold_pide=0,
        write_notfound=False,
        ## allowed number of nucleotides for exon boundaries to
        ## be considered equivalent.
        slipping_exon_boundary=9,
        ## stop codons to search for
        stop_codons=("TAG", "TAA", "TGA"),
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    reference_exon_boundaries = {}
    if options.filename_boundaries:
        reference_exon_boundaries = Exons.ReadExonBoundaries(
            open(options.filename_boundaries, "r"),
            do_invert=1,
            remove_utr=1)
        E.info("read exon boundaries for %i queries" %
               len(reference_exon_boundaries))

    if options.filename_exons:
        outfile_exons = open(options.filename_exons, "w")
        outfile_exons.write("%s\n" % "\t".join(
            ("prediction_id", "exon_id", "exon_from", "exon_to", "exon_frame",
             "reference_id", "reference_from", "reference_to",
             "reference_phase", "pidentity", "psimilarity", "nframeshifts",
             "ngaps", "nstopcodons", "is_ok",
             "genome_exon_from", "genome_exon_to")))
    else:
        outfile_exons = None

    if options.filename_peptides:
        peptide_sequences = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
        E.info("read peptide sequences for %i queries" %
               len(peptide_sequences))
    else:
        peptide_sequences = {}

    entry = PredictionParser.PredictionParserEntry()
    last_filename_genome = None

    nfound, nmissed_exons, nmissed_length = 0, 0, 0
    nempty_alignments = 0

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    options.stdout.write("%s\n" % "\t".join(
        ("prediction_id", "number", "dubious_exons", "boundaries_sum",
         "boundaries_max", "identical_exons", "inserted_exons",
         "deleted_exons", "inserted_introns", "deleted_introns",
         "truncated_Nterminus", "truncated_Cterminus",
         "deleted_Nexons", "deleted_Cexons",
         "inserted_Nexons", "inserted_Cexons")))

    for line in sys.stdin:

        if line[0] == "#":
            continue

        try:
            entry.Read(line)
        except ValueError, msg:
            print "# parsing failed with msg %s in line %s" % (msg, line[:-1])
            sys.exit(1)

        exons = Genomics.Alignment2ExonBoundaries(
            entry.mMapPeptide2Genome,
            query_from=entry.mQueryFrom,
            sbjct_from=entry.mSbjctGenomeFrom,
            add_stop_codon=0)

        if exons[-1][4] != entry.mSbjctGenomeTo:
            print "# WARNING: discrepancy in exon calculation!!!"
            for e in exons:
                print "#", str(e)
            print "#", str(entry)

        if options.loglevel >= 5:
            for e in exons:
                print "#", str(e)

        genomic_fragment = fasta.getSequence(entry.mSbjctToken,
                                             entry.mSbjctStrand,
                                             entry.mSbjctGenomeFrom,
                                             entry.mSbjctGenomeTo)

        skip = False
        if peptide_sequences.has_key(entry.mQueryToken):

            query_sequence = alignlib_lite.makeSequence(
                peptide_sequences[entry.mQueryToken])
            sbjct_sequence = alignlib_lite.makeSequence(entry.mTranslation)

            percent_similarity, percent_identity = 0, 0
            if query_sequence.getLength() < \
               entry.mMapPeptide2Translation.getRowTo():
                print "# WARNING: query sequence %s is too short: %i %i" % (
                    entry.mQueryToken,
                    query_sequence.getLength(),
                    entry.mMapPeptide2Translation.getRowTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True

            elif sbjct_sequence.getLength() < \
                 entry.mMapPeptide2Translation.getColTo():
                print "# WARNING: sbjct sequence %s is too short: %i %i" % (
                    entry.mSbjctToken,
                    sbjct_sequence.getLength(),
                    entry.mMapPeptide2Translation.getColTo())
                sys.stdout.flush()
                nmissed_length += 1
                skip = True
            else:
                alignlib_lite.rescoreAlignment(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence,
                    alignlib_lite.makeScorer(query_sequence, sbjct_sequence))
                percent_identity = alignlib_lite.calculatePercentIdentity(
                    entry.mMapPeptide2Translation,
                    query_sequence,
                    sbjct_sequence) * 100
                percent_similarity = alignlib_lite.calculatePercentSimilarity(
                    entry.mMapPeptide2Translation) * 100

            E.debug("prediction %s: percent identity/similarity: before=%5.2f/%5.2f, realigned=%5.2f/%5.2f" %
                    (str(entry.mPredictionId),
                     entry.mPercentSimilarity,
                     entry.mPercentIdentity,
                     percent_similarity,
                     percent_identity))

        else:
            query_sequence = None
            sbjct_sequence = None

        # default values
        exons_num_exons = "na"
        exons_boundaries_sum = "na"
        exons_boundaries_max = "na"
        dubious_exons = "na"

        ndeleted_exons, ninserted_exons, ndeleted_introns, ninserted_introns, nidentical_exons = 0, 0, 0, 0, 0
        truncated_Nterminal_exon, truncated_Cterminal_exon = 0, 0
        ndeleted_Nexons, ndeleted_Cexons = 0, 0
        ninserted_Nexons, ninserted_Cexons = 0, 0

        exons_offset = exons[0][3]

        if not reference_exon_boundaries.has_key(entry.mQueryToken):
            print "# WARNING: sequence %s has no exon boundaries" % (
                entry.mQueryToken)
            sys.stdout.flush()
            nmissed_exons += 1
            skip = True

        if not skip:

            nfound += 1

            ref_exons = reference_exon_boundaries[entry.mQueryToken]
            ref_exons_offset = ref_exons[0].mGenomeFrom

            exons_num_exons = len(ref_exons) - len(exons)
            exons_boundaries_sum = 0
            exons_phase = 0
            exons_boundaries_max = 0
            dubious_exons = 0

            inserted_exons = 0
            temp_inserted_exons = 0

            if options.loglevel >= 3:
                for e in exons:
                    options.stdlog.write("# %s\n" % str(e))
                for e in ref_exons:
                    options.stdlog.write("# %s\n" % str(e))

            min_pide = entry.mPercentIdentity * \
                options.quality_threshold_pide / 100

            in_sync = 0
            e, r = 0, 0

            while e < len(exons) and r < len(ref_exons):

                this_e, this_r = e + 1, r + 1
                percent_identity = 0
                percent_similarity = 0
                is_good_exon = 0

                if options.loglevel >= 4:
                    options.stdlog.write("# current exons: %i and %i\n" %
                                         (e, r))
                    sys.stdout.flush()

                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exons[e][0:6]
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)

                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset

                ## get percent identity for exon
                exon_percent_identity = 0
                exon_percent_similarity = 0

                if query_sequence and sbjct_sequence:

                    tmp_ali = alignlib_lite.makeAlignmentVector()

                    xquery_from = exon_from / 3
                    xquery_to = exon_to / 3

                    alignlib_lite.copyAlignment(tmp_ali,
                                                entry.mMapPeptide2Translation,
                                                xquery_from, xquery_to)

                    if tmp_ali.getLength() == 0:
                        options.stdlog.write(
                            "# WARNING: empty alignment %s\n" % str(
                                (ref_from, exon_from, ref_to, exon_to,
                                 xquery_from, xquery_to)))
                        nempty_alignments += 1
                    else:
                        if options.loglevel >= 5:
                            options.stdlog.write("# %s\n" % str(
                                alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence)))

                        exon_percent_identity = alignlib_lite.calculatePercentIdentity(
                            tmp_ali, query_sequence, sbjct_sequence) * 100
                        exon_percent_similarity = alignlib_lite.calculatePercentSimilarity(
                            tmp_ali) * 100

                if exon_percent_identity >= min_pide:
                    is_good_exon = 1
                else:
                    is_good_exon = 0

                if e < len(exons) - 1:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = exons[e + 1][0:6]
                else:
                    (next_exon_from, next_exon_to, next_exon_phase,
                     next_exon_genome_from, next_exon_genome_to,
                     next_exon_ali) = 0, 0, 0, 0, 0, []

                if r < len(ref_exons) - 1:
                    next_ref_from, next_ref_to, next_ref_phase = (
                        ref_exons[r + 1].mPeptideFrom,
                        ref_exons[r + 1].mPeptideTo,
                        ref_exons[r + 1].frame)
                else:
                    next_ref_from, next_ref_to, next_ref_phase = 0, 0, 0

                if options.loglevel >= 2:
                    options.stdlog.write("# %s\n" % "\t".join(
                        map(str, (entry.mQueryToken,
                                  exon_from, exon_to, exon_phase,
                                  exon_genome_from, exon_genome_to,
                                  ref_from, ref_to, ref_phase))))
                    sys.stdout.flush()

                # beware of small exons.
                # if less than options.slipping_exon_boundary: boundary is 0
                # check if end is more than options.slipping_exon_boundary
                # apart as well.
                if exon_to - exon_from <= options.slipping_exon_boundary or \
                   ref_to - ref_from <= options.slipping_exon_boundary:
                    boundary = 0
                else:
                    boundary = options.slipping_exon_boundary

                if ref_to <= exon_from + boundary and \
                   ref_to <= exon_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if e == 0:
                        ndeleted_Nexons += 1
                    else:
                        ndeleted_exons += 1
                    r += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = 0, 0, 0, 0, 0
                    overlap = 0
                elif exon_to <= ref_from + boundary and \
                     exon_to <= ref_to - options.slipping_exon_boundary:
                    ## no overlap
                    is_good_exon = 0
                    if r == 0:
                        ninserted_Nexons += 1
                    else:
                        ninserted_exons += 1
                    e += 1
                    ref_from, ref_to, ref_phase = 0, 0, 0
                    overlap = 0
                else:
                    ## overlap
                    overlap = 1
                    dfrom = int(math.fabs(exon_from - ref_from))
                    dto = int(math.fabs(exon_to - ref_to))

                    ## get percent identity for overlapping fragment
                    if query_sequence and sbjct_sequence:
                        ## this the problem
                        tmp_ali = alignlib_lite.makeAlignmentVector()

                        xquery_from = max(ref_from / 3, exon_from / 3)
                        xquery_to = min(ref_to / 3, exon_to / 3)

                        alignlib_lite.copyAlignment(
                            tmp_ali, entry.mMapPeptide2Translation,
                            xquery_from, xquery_to)

                        if tmp_ali.getLength() == 0:
                            options.stdlog.write(
                                "# warning: empty alignment %s\n" % str(
                                    (ref_from, exon_from, ref_to, exon_to,
                                     xquery_from, xquery_to)))
                            percent_identity = 0
                            percent_similarity = 0
                        else:
                            if options.loglevel >= 5:
                                print str(alignlib_lite.AlignmentFormatExplicit(
                                    tmp_ali, query_sequence, sbjct_sequence))
                            percent_identity = alignlib_lite.calculatePercentIdentity(
                                tmp_ali, query_sequence, sbjct_sequence) * 100
                            percent_similarity = alignlib_lite.calculatePercentSimilarity(
                                tmp_ali) * 100

                    if percent_identity >= min_pide:
                        is_good_exon = 1
                    else:
                        is_good_exon = 0
                        dubious_exons += 1

                    ## adjust regions for terminal exons
                    if e == 0 and r == 0 and \
                       dfrom <= (entry.mQueryFrom - 1) * 3 and dfrom > 0:
                        if is_good_exon:
                            truncated_Nterminal_exon = dfrom
                        dfrom = 0

                    ## truncated terminal exons
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                       dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        if is_good_exon:
                            truncated_Cterminal_exon = dto
                        dto = 0

                    ## do not count deviations for terminal query exons
                    if e == 0 and dfrom <= entry.mQueryFrom * 3 and dfrom > 0:
                        dfrom = 0

                    if e == len(exons) - 1 and \
                       dto <= (entry.mQueryLength - entry.mQueryTo) * 3 and dto > 0:
                        dto = 0

                    ## permit difference of one codon (assumed to be stop)
                    if e == len(exons) - 1 and r == len(ref_exons) - 1 and \
                       dto == 3:
                        dto = 0

                    ## deal with different boundary conditions:
                    if dfrom == 0 and dto == 0:
                        if is_good_exon:
                            nidentical_exons += 1
                        e += 1
                        r += 1
                    ## next exon within this ref_exon
                    elif exon_to < ref_to and next_exon_to and \
                         next_exon_to <= ref_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ninserted_introns += 1
                        e += 1
                        in_sync = 1
                        dto = 0
                    ## next ref_exon within this exon
                    elif ref_to < exon_to and next_ref_to and \
                         next_ref_to <= exon_to + options.slipping_exon_boundary:
                        if is_good_exon:
                            ndeleted_introns += 1
                        r += 1
                        in_sync = 1
                        dto = 0
                    else:
                        e += 1
                        r += 1
                        if in_sync:
                            dfrom = 0

                    if is_good_exon:
                        exons_boundaries_sum += dfrom + dto
                        exons_boundaries_max = max(dfrom, exons_boundaries_max)
                        exons_boundaries_max = max(dto, exons_boundaries_max)

                    ###########################################################
                    ## count inserted/deleted introns and misplaced boundaries
                    ##
                    ## if exon and next_exon in ref_exon: inserted intron
                    ## if ref_exon and next_ref_exon in exon: deleted intron

                if outfile_exons:

                    if genomic_fragment and exon_genome_to:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment,
                            border_stop_codon=0)
                    else:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons = 0, 0, 0, 0, 0

                    if exon_to == 0:
                        this_e = 0
                    if ref_to == 0:
                        this_r = 0

                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            this_e, exon_from, exon_to, exon_phase,
                            this_r, ref_from, ref_to, ref_phase,
                            percent_identity, percent_similarity,
                            nframeshifts, ngaps, nstopcodons,
                            is_good_exon,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

            while e < len(exons):
                exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to = exons[e][0:5]
                e += 1
                ninserted_Cexons += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            e, exon_from, exon_to, exon_phase,
                            0, 0, 0, 0,
                            0, 0, 0, 0, 0,
                            1,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

            while r < len(ref_exons):
                ref_from, ref_to, ref_phase, ref_genome_from, ref_genome_to = (
                    ref_exons[r].mPeptideFrom,
                    ref_exons[r].mPeptideTo,
                    ref_exons[r].frame,
                    ref_exons[r].mGenomeFrom,
                    ref_exons[r].mGenomeTo)
                ndeleted_Cexons += 1
                ref_genome_from -= ref_exons_offset
                ref_genome_to -= ref_exons_offset
                r += 1

                if outfile_exons:
                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            0, 0, 0, 0,
                            r, ref_from, ref_to, ref_phase,
                            0, 0, 0, 0, 0,
                            0,
                            0, 0,
                        )), "\t") + "\n")
        else:
            if options.write_notfound:
                this_e = 0
                ## use prediction's identity/similarity for exons.
                ## This will still then flag stop-codons in later analysis
                percent_identity = entry.mPercentIdentity
                percent_similarity = entry.mPercentSimilarity

                for exon in exons:
                    this_e += 1
                    exon_from, exon_to, exon_phase, exon_genome_from, exon_genome_to, exon_ali = exon[0:6]
                    if genomic_fragment:
                        nintrons, nframeshifts, ngaps, nsplits, nstopcodons, disruptions = Genomics.CountGeneFeatures(
                            exon_genome_from - entry.mSbjctGenomeFrom,
                            exon_ali,
                            genomic_fragment)

                    outfile_exons.write(
                        string.join(map(str, (
                            entry.mPredictionId,
                            this_e, exon_from, exon_to, exon_phase,
                            0, 0, 0, 0,
                            percent_identity, percent_similarity,
                            nframeshifts, ngaps, nstopcodons,
                            1,
                            exon_genome_from, exon_genome_to,
                        )), "\t") + "\n")

        options.stdout.write("\t".join(map(str, (
            entry.mPredictionId,
            exons_num_exons,
            dubious_exons,
            exons_boundaries_sum,
            exons_boundaries_max,
            nidentical_exons,
            ninserted_exons,
            ndeleted_exons,
            ninserted_introns,
            ndeleted_introns,
            truncated_Nterminal_exon,
            truncated_Cterminal_exon,
            ndeleted_Nexons,
            ndeleted_Cexons,
            ninserted_Nexons,
            ninserted_Cexons))) + "\n")
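# A reduced sketch of the exon comparison policy used above: two exon
# boundaries count as matching if they differ by at most a "slippage"
# tolerance, and the tolerance drops to zero for exons shorter than the
# tolerance itself. Plain CDS coordinates only; no alignments involved.
def boundaries_match(exon, ref, slippage=9):
    """exon, ref: (from, to) CDS coordinates; returns True on a match."""
    (efrom, eto), (rfrom, rto) = exon, ref
    if eto - efrom <= slippage or rto - rfrom <= slippage:
        tolerance = 0  # small exons must match exactly
    else:
        tolerance = slippage
    return abs(efrom - rfrom) <= tolerance and abs(eto - rto) <= tolerance

# boundaries_match((0, 100), (3, 100))  -> True  (within 9 bases)
# boundaries_match((0, 8), (3, 8))      -> False (small exon, exact only)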
def ReadTranscriptsAndCds(transcript_ids1, transcript_ids2):

    if param_loglevel >= 1:
        print "# reading %i left and %i right transcripts" % (
            len(transcript_ids1), len(transcript_ids2))
        sys.stdout.flush()

    if param_loglevel >= 1:
        print "# reading exon boundaries."
        sys.stdout.flush()

    cds1 = Exons.ReadExonBoundaries(open(param_filename_cds1, "r"),
                                    filter=transcript_ids1,
                                    reset=True)
    cds2 = Exons.ReadExonBoundaries(open(param_filename_cds2, "r"),
                                    filter=transcript_ids2,
                                    reset=True)

    if param_loglevel >= 1:
        print "# read %i left and %i right cds" % (len(cds1), len(cds2))
        sys.stdout.flush()

    if param_loglevel >= 2:
        if len(cds1) != len(transcript_ids1):
            print "# missed in left: %s" % ":".join(
                set(transcript_ids1.keys()).difference(cds1.keys()))
        if len(cds2) != len(transcript_ids2):
            print "# missed in right: %s" % ":".join(
                set(transcript_ids2.keys()).difference(cds2.keys()))

    if param_loglevel >= 1:
        print "# reading genomic sequences."
        sys.stdout.flush()

    transcripts1 = {}
    if param_filename_transcripts1:
        if param_mode_genome1 == "indexed":
            transcripts1 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts1,
                filter=transcript_ids1)
        else:
            transcripts1 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts1, "r"),
                do_reverse=0,
                filter=transcript_ids1,
                mask=param_mask)

    transcripts2 = {}
    if param_filename_transcripts2:
        if param_mode_genome2 == "indexed":
            transcripts2 = Genomics.ParseFasta2HashFromIndex(
                param_filename_transcripts2,
                filter=transcript_ids2)
        else:
            transcripts2 = Genomics.ReadGenomicSequences(
                open(param_filename_transcripts2, "r"),
                do_reverse=0,
                filter=transcript_ids2,
                mask=param_mask)

    if param_loglevel >= 1:
        print "# read %i left and %i right transcript sequences" % (
            len(transcripts1), len(transcripts2))
        sys.stdout.flush()

    return transcripts1, transcripts2, cds1, cds2
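# The set arithmetic above is the whole "missed transcripts" report. A tiny
# runnable version, with plain dicts standing in for the CDS dictionaries:
def report_missed(requested, loaded):
    """IDs that were requested but are absent from what was actually loaded."""
    return sorted(set(requested).difference(loaded))

# report_missed({"t1": 1, "t2": 1}, {"t1": [1]}) -> ['t2']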
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gtf2exons.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--coordinate-format", dest="coordinate_format",
                      type="string",
                      help="input type of coordinates.")
    parser.add_option("--forward-coordinates", dest="forward_coordinates",
                      action="store_true",
                      help="output forward coordinates.")
    parser.add_option("-e", "--extract-id", dest="extract_id",
                      type="string",
                      help="""regular expression to extract id from id column, e.g. 'transcript_id "(\S+)"'.""")

    parser.set_defaults(
        coordinate_format="zero-forward",
        forward_coordinates=False,
        genome_file=None,
        extract_id=None)

    (options, args) = E.Start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    if options.extract_id:
        extract_id = re.compile(options.extract_id)
    else:
        extract_id = None

    converter = IndexedFasta.getConverter(options.coordinate_format)

    exons = Exons.ReadExonBoundaries(sys.stdin,
                                     contig_sizes=contig_sizes,
                                     converter=converter,
                                     do_invert=True,
                                     format="gtf",
                                     gtf_extract_id=extract_id)

    ntranscripts, nexons, nerrors = 0, 0, 0
    for id, ee in exons.items():
        ntranscripts += 1
        has_error = False
        for e in ee:
            if options.forward_coordinates and \
               e.mSbjctToken in contig_sizes and \
               e.mSbjctStrand == "-":
                l = contig_sizes[e.mSbjctToken]
                e.mGenomeFrom, e.mGenomeTo = l - e.mGenomeTo, l - e.mGenomeFrom
            if e.mGenomeFrom < 0:
                has_error = True
                if options.loglevel >= 1:
                    options.stderr.write("# Error: %s\n" % str(e))
                break
            options.stdout.write(str(e) + "\n")
            nexons += 1
        if has_error:
            nerrors += 1
            continue

    if options.loglevel >= 1:
        options.stdlog.write("# ntranscripts=%i, nexons=%i, nerrors=%i\n" %
                             (ntranscripts, nexons, nerrors))

    E.Stop()
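# The forward-coordinate conversion above maps an interval given on the
# reverse strand into forward-strand coordinates by reflecting it at the
# contig length: [from, to) on "-" becomes [l - to, l - from) on "+".
# A worked example with a contig of length 1000:
def to_forward(start, end, contig_length):
    return contig_length - end, contig_length - start

# to_forward(100, 250, 1000) -> (750, 900)
# A negative result signals an interval that did not fit the contig, which
# is exactly the condition the loop above counts in `nerrors`.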
            continue
        else:
            nnotfound += 1

        new_results.append(entry)
        noutput += 1

    results = new_results

    if results:
        options.stdout.write(str(results) + "\n")

elif options.output_format == "exontable":

    if options.format == "exons":
        exons = Exons.ReadExonBoundaries(sys.stdin,
                                         contig_sizes=contig_sizes,
                                         delete_missing=True)
    else:
        raise ValueError("unknown format.")

    for k in exons.keys():
        ee = exons[k]
        id = 0
        for e in ee:
            id += 1
            print "\t".join(map(str, (e.mQueryToken, id,
                                      e.mPeptideFrom, e.mPeptideTo,
                                      e.frame,
                                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                      e.mGenomeFrom, e.mGenomeTo)))
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    # assumption: this excerpt does not show the parser being created, so a
    # generic one is instantiated here following the other scripts; the
    # original version string is not available.
    parser = E.OptionParser(usage=globals()["__doc__"])

    # the long option strings below carried a stray trailing "=" in the
    # excerpt ("--format=" etc.), which optparse rejects
    parser.add_option("-q", "--quality", dest="quality", type="string",
                      help="quality categories to take into account.")
    parser.add_option("-f", "--format", dest="format", type="string",
                      help="input format [exons|gff|table]")
    parser.add_option("-e", "--exons", dest="tablename_exons", type="string",
                      help="table name with exons.")
    parser.add_option("-p", "--predictions", dest="tablename_predictions",
                      type="string",
                      help="table name with predictions.")
    parser.add_option("-n", "--non-redundant", dest="non_redundant",
                      action="store_true",
                      help="only non-redundant predictions.")
    parser.add_option("-s", "--schema", dest="schema", type="string",
                      help="schema to use.")

    parser.set_defaults(
        fields=["Id", "NumExons", "GeneLength", "MinExonLength",
                "MaxExonLength", "MinIntronLength", "MaxIntronLength"],
        tablename_exons="exons",
        tablename_predictions="predictions",
        quality=None,
        non_redundant=False,
        schema=None,
        tablename_redundant="redundant",
        tablename_quality="quality",
        format="exons",
    )

    (options, args) = E.Start(parser,
                              add_csv_options=True,
                              add_psql_options=True)

    if options.quality:
        options.quality = options.quality.split(",")

    if options.format == "table":
        dbhandle = pgdb.connect(options.psql_connection)
        exons = Exons.GetExonBoundariesFromTable(
            dbhandle,
            options.tablename_predictions,
            options.tablename_exons,
            non_redundant_filter=options.non_redundant,
            quality_filter=options.quality,
            table_name_quality=options.tablename_quality,
            table_name_redundant=options.tablename_redundant,
            schema=options.schema)
    else:
        exons = Exons.ReadExonBoundaries(sys.stdin)

    stats = Exons.CalculateStats(exons)

    print "\t".join(options.fields)

    writer = csv.DictWriter(sys.stdout,
                            options.fields,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    for k, v in stats.items():
        v["Id"] = k
        writer.writerow(v)

    E.Stop()
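# Exons.CalculateStats is consumed above as a dict of per-gene records. A
# hedged sketch of the kind of statistics the field list implies, computed
# from sorted (start, end) exon intervals; the CGAT implementation may differ
# in details such as interval conventions.
def exon_stats(exons):
    """exons: sorted list of (start, end); returns a dict of summary fields."""
    exon_lengths = [end - start for start, end in exons]
    intron_lengths = [b[0] - a[1] for a, b in zip(exons, exons[1:])]
    return {
        "NumExons": len(exons),
        "GeneLength": exons[-1][1] - exons[0][0],
        "MinExonLength": min(exon_lengths),
        "MaxExonLength": max(exon_lengths),
        "MinIntronLength": min(intron_lengths) if intron_lengths else 0,
        "MaxIntronLength": max(intron_lengths) if intron_lengths else 0,
    }

# exon_stats([(0, 100), (200, 260)])
# -> NumExons=2, GeneLength=260, Min/MaxExonLength=60/100,
#    Min/MaxIntronLength=100/100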
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--master", dest="master", type="string",
                      help="master sequence.")
    parser.add_option("-p", "--master-pattern", dest="master_pattern",
                      type="string",
                      help="master pattern.")
    parser.add_option("--master-species", dest="master_species", type="string",
                      help="species to use as master sequences.")
    parser.add_option("-t", "--translate", dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")
    parser.add_option("-e", "--exons", dest="filename_exons", type="string",
                      help="filename on where to exon information.")
    parser.add_option("-c", "--mark-codons", dest="mark_codons",
                      action="store_true",
                      help="mark codons.")
    parser.add_option("-i", "--ignore-case", dest="ignore_case",
                      action="store_true",
                      help="ignore case (otherwise: lowercase are unaligned chars).")
    parser.add_option("--remove-stops", dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")
    parser.add_option("--mask-stops", dest="mask_stops",
                      action="store_true",
                      help="mask stop codons.")
    parser.add_option("--mask-char", dest="mask_char", type="string",
                      help="masking character to use.")
    parser.add_option("-f", "--remove-frameshifts", dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")
    parser.add_option("--mask-master", dest="mask_master",
                      action="store_true",
                      help="columns in master to be removed are masked to keep residue numbering.")
    parser.add_option("-s", "--split-exons", dest="split_exons",
                      action="store_true",
                      help="split columns aligned to different exons in the same gene.")
    parser.add_option("-a", "--target", dest="target", type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    identifiers = mali.getIdentifiers()
    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #################################################################################
    ## translate characters to upper/lower case according to exon info.
    #################################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)
    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #################################################################################
    ## untangle misaligned exons
    #################################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    #################################################################################
    ## remove frameshifts
    #################################################################################
    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:
            frame_columns = GetFrameColumns(mali, masters[0],
                                            gap_chars=options.gap_chars)
        else:
            columns = []
            for id in masters:
                columns += GetFrameColumns(mali, id,
                                           gap_chars=options.gap_chars)
            if len(columns) == 0:
                columns += GetFrameColumns(mali, identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first:
            # (1,2,3) before (1,2,100), and (1,2,100) before (1,3,4).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))

            # select codons
            frame_columns = []
            last_codon = columns[0]
            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    out_of_frame_columns += last_codon

                # if overlapping, but out of register: skip
                last_codon = codon

            frame_columns.append(last_codon)

        # build set of skipped columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

            if options.loglevel >= 5:
                options.stdlog.write("# frame columns: %i\n" %
                                     (len(frame_columns)))
                x = 0
                for column in frame_columns:
                    options.stdlog.write("# %i\t%s\n" %
                                         (x, ",".join(map(str, column))))
                    x += 1

            if options.loglevel >= 5:
                options.stdlog.write(
                    "# Out-of frame columns with residue of masters: %i\n" %
                    (len(out_of_frame_set)))
                options.stdlog.write("# %s" %
                                     ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))

        to_delete = []

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            ## treat masters differently if they are only to be masked, not
            ## pruned.
            ## simple mask all characters that are to skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))

##            else:
##                for a,b,c in frame_columns:
##                    codon = sequence[a] + sequence[b] + sequence[c]
##                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon( codon, options )
##                    if codon_is_aligned: naligned += 1
##                    if codon_is_all_gaps:
##                        fragments.append( options.gap_char * 3 )
##                        ngaps += 1
##                    elif codon_is_ok:
##                        ncodons += 1
##                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
##                            if options.remove_stops:
##                                fragments.append( options.gap_char * 3 )
##                            elif options.mask_stops:
##                                fragments.append( options.mask_char * 3 )
##                            else:
##                                fragments.append( codon )
##                            nstops += 1
##                        else:
##                            fragments.append( codon )
##                    else:
##                        fragments.append( options.gap_char * 3 )
##                        nmasked += 1

##                    if options.loglevel >= 7:
##                        options.stdlog.write("# %s: %i,%i,%i: codon=%s ok=%s is_aligned=%s\n" % (id,
##                                                                                                 a,b,c,
##                                                                                                 codon,
##                                                                                                 str(codon_is_ok),
##                                                                                                 str(codon_is_aligned) ))

            s = string.join(fragments, "")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops,
                       ngaps, nmasked))
                options.stdlog.flush()

            ## postpone deletion in order to not
            ## confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n" % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n" % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, string.join(fragments, ""))

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
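# A compact sketch of GetFrameColumns-style logic referenced above: walk a
# master row of the alignment and emit one (i, j, k) column triple per three
# non-gap characters, i.e. the columns that hold one in-frame codon. The
# function name and behaviour here illustrate the idea, not the CGAT
# implementation.
def frame_columns(master_row, gap_chars="-."):
    columns, codon = [], []
    for x, c in enumerate(master_row):
        if c in gap_chars:
            continue
        codon.append(x)
        if len(codon) == 3:
            columns.append(tuple(codon))
            codon = []
    return columns

# frame_columns("AT-GC--A") -> [(0, 1, 3)]
# (columns 0, 1 and 3 carry the first codon; the trailing "CA" is incomplete)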
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--overlap", dest="overlap_residues", type="int",
                      help="overlap residues.")
    parser.add_option("-t", "--filter-tokens", dest="filename_filter_tokens",
                      type="string",
                      help="filename to filter tokens.")
    parser.add_option("-i", "--exon-identity", dest="exon_identity",
                      action="store_true",
                      help="exon identity.")
    parser.add_option("--exons", dest="filename_exons", type="string",
                      help="filename with exon information.")
    parser.add_option("-m", "--output-members", dest="filename_members",
                      type="string",
                      help="output filename with members.")
    parser.add_option("--overlap-id", dest="overlap_id", action="store_true",
                      help="overlap id.")
    parser.add_option("-s", "--remove-spanning",
                      dest="remove_spanning_predictions", action="store_true",
                      help="remove spanning predictions.")
    parser.add_option("-c", "--remove-complement",
                      dest="remove_complementary_predictions",
                      action="store_true",
                      help="remove complementary predictions.")
    parser.add_option("--remove-exon-swoppers", dest="remove_exon_swoppers",
                      action="store_true",
                      help="remove exon swoppers.")
    parser.add_option("--remove-gene-spanners", dest="remove_gene_spanners",
                      action="store_true",
                      help="remove gene spanners.")
    parser.add_option("--remove-suboptimal", dest="remove_suboptimal",
                      action="store_true",
                      help="remove suboptimal predictions.")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with peptide information.")
    parser.add_option("--extended-peptides",
                      dest="filename_extended_peptides", type="string",
                      help="filename with peptide information - after extension.")
    parser.add_option("--test", dest="test_nids", type="string",
                      help="test nids.")
    ## filter options
    parser.add_option("--filter-transcripts",
                      dest="filter_filename_transcripts", type="string",
                      help="filename with transcripts that are used to filter.")
    parser.add_option("--filter-remove-spanning",
                      dest="filter_remove_spanning", action="store_true",
                      help="remove all transcripts that span the filter set.")
    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic data (indexed).")
    parser.add_option("--discard-large-clusters",
                      dest="discard_large_clusters", type="int",
                      help="if set discard clusters bigger than this size (patch) [default=%default].")

    parser.set_defaults(
        filename_members=None,
        filename_peptides=None,
        filename_extended_peptides=None,
        filename_exons=None,
        quality_hierarchy=("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP",
                           "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"),
        ## Classes, where redundancy is removed by similarity. When exon
        ## structure is not conserved, I can't predict alternative splice
        ## variants, so remove the redundancy.
        quality_exclude_same=("UG", "UP", "UF", "BF", "UK"),
        quality_genes=("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious=("UG", "UP", "UF", "BF", "UK"),
        ## class that is required for defining exon swopper event
        quality_remove_exon_swopper=("CG", "PG"),
        ## class that will kept, in spite of being an exons swopper.
        quality_keep_exon_swopper=(),
        ## class that is required for removing gene spanners;
        ## note: one-element tuple - a bare ("CG") would be a plain string
        quality_remove_gene_spanners=("CG", ),
        ## class that will kept, in spite of being a gene spanner
        quality_keep_gene_spanners=(),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal=("CG", "PG"),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal=(),
        ## gap penalties
        gop=-10.0,
        gep=-1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps=20,
        ## threshold of percent identity that allows to remove a prediction
        ## of a lower class.
        ## This allows for insertions/deletions
        min_identity=98,
        ## threshold of percent identity that allows to remove a prediction
        ## of a non-gene by a gene
        min_identity_non_genes=80,
        ## safety threshold: do not remove, if coverage of member is by x
        ## better than representative
        safety_pide=10,
        safety_coverage=10,
        overlap_id=False,
        remove_spanning_predictions=False,
        remove_exon_swoppers=False,
        remove_gene_spanners=False,
        remove_suboptimal=False,
        ## nids to use for testing
        test_nids=None,
        ## remove members with less than maximum coverage
        max_member_coverage=90,
        ## maximum allowable exon slippage
        max_slippage=9,
        ## minimum difference in identity for suboptimal predictions to be
        ## removed.
        suboptimal_min_identity_difference=10,
        ## filter options
        filter_filename_transcripts=None,
        filter_remove_spanning=True,
        filter_remove_spanning_both_strands=True,
        genome_file=None,
        discard_large_clusters=None)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.test_nids:
        options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}

    if options.filename_members:
        outfile_members = open(options.filename_members, "w")
    else:
        outfile_members = sys.stdout

    ######################################################
    # data
    ######################################################
    data = []

    class Entry:
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int(gff['xstart'])
            self.mExtendedEnd = int(gff['xend'])
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']

    for gff in GTF.iterator(sys.stdin):
        data.append(Entry(gff))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i transcripts.\n" % len(data))
        options.stdlog.flush()

    ######################################################
    # read peptide sequences
    ######################################################
    if options.loglevel >= 1:
        options.stdlog.write("# loading peptide databases ... ")
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta(options.filename_peptides)
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    # read extended peptide sequences
    ######################################################
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta(
            options.filename_extended_peptides)
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write("finished\n")
        options.stdlog.flush()

    ######################################################
    ## open genome file
    ######################################################
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ## reading exons, clustering and formatting them.
    ######################################################
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write("# reading exon boundaries ... ")
            options.stdlog.flush()

        ids = [x.transcript_id for x in data]

        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         contig_sizes=contig_sizes,
                                         filter=set(ids))

        if options.loglevel >= 1:
            options.stdlog.write("done - read exons for %i transcripts\n" %
                                 (len(exons)))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")

        # flag terminal exons
        Exons.SetRankToPositionFlag(exons)

        identity_map_cluster2transcripts, identity_map_transcript2cluster = \
            Exons.ClusterByExonIdentity(exons,
                                        max_terminal_num_exons=3,
                                        max_slippage=options.max_slippage,
                                        loglevel=options.loglevel)

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster = \
            Exons.ClusterByExonOverlap(exons,
                                       min_overlap=10,
                                       loglevel=options.loglevel)
    else:
        exons = {}

    ######################################################
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ## read filter transcripts and apply filters
    ######################################################
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write(
                "# reading exon boundaries for filter set ... ")
            options.stdlog.flush()

        filter_exons = Exons.ReadExonBoundaries(
            open(options.filter_filename_transcripts, "r"),
            delete_missing=True,
            contig_sizes=contig_sizes)

        if options.loglevel >= 1:
            options.stdlog.write("done - read exons for %i transcripts\n" %
                                 (len(filter_exons)))

        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts(
            exons,
            filter_exons,
            eliminated_predictions,
            contig_sizes,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n"
                % (n, time.time() - t))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError(
            "please specify exon table if using --remove-swoppers.")
    if options.remove_gene_spanners and not exons:
        raise ValueError(
            "please specify exon table if using --remove-gene-spanners.")

    ########################################################################################
    ## remove predictions spanning other predictions but do not overlap with
    ## them on an exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing gene spanners\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateGeneSpanners(data,
                                           eliminated_predictions,
                                           exons,
                                           options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write("# removed %i gene spanners in %i seconds\n" %
                                 (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid
    if options.loglevel >= 1:
        options.stdlog.write("# sorting data\n")
        options.stdlog.flush()

    map2pos = {}
    for x in range(len(options.quality_hierarchy)):
        map2pos[options.quality_hierarchy[x]] = x

    data.sort(key=lambda x: (map2pos[x.mQuality],
                             len(extended_peptides[x.transcript_id]),
                             x.mQueryCoverage * x.mPid))

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write("# sorting data finished\n")
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping
    ## predictions
    if options.remove_exon_swoppers and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing exon swoppers\n")
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers(
            data,
            eliminated_predictions,
            identity_map_transcript2cluster,
            identity_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write("# removed %i exon swoppers\n" % n)
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:
        if options.loglevel >= 1:
            options.stdlog.write("# removing suboptimal predictions\n")
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions(
            data,
            eliminated_predictions,
            overlap_map_transcript2cluster,
            overlap_map_cluster2transcripts,
            map_prediction2data,
            exons,
            options)

        n = PrintMembers(0, outfile_members, eliminated, eliminated_by_method)
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write(
                "# removed %i suboptimal predictions in %i seconds\n" %
                (n, time.time() - t))
            options.stdlog.flush()

    ########################################################################################
    ## remove redundant predictions
    l = len(data)
    options.report_step = max(1, int(l / 100))
    t2 = time.time()

    last_quality = None
    qualities = []

    options.stdout.write("%s\t%s\n" % ("rep", "comment"))

    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step == 0:
                options.stdlog.write(
                    "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" %
                    (x + 1, l,
                     int(100 * (x + 1) / l),
                     len(eliminated_predictions), l,
                     100 * len(eliminated_predictions) / l,
                     time.time() - t2))
                options.stdlog.flush()

        rep = data[x]
        rep_id, rep_quality = rep.transcript_id, rep.mQuality

        if rep_id in eliminated_predictions:
            continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append(last_quality)
            last_quality = rep_quality

        if options.loglevel >= 2:
            options.stdlog.write("# processing prediction %s|%s\n" %
                                 (rep_id, rep_quality))
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap(
                rep,
                data[x + 1:],
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)
        else:
            eliminated += EliminateRedundantEntriesByRange(
                rep,
                data,
                eliminated_predictions,
                options,
                peptides,
                extended_peptides,
                filter_quality=qualities,
                this_quality=rep_quality)

        options.stdout.write("%s\t%i\n" % (rep_id, len(eliminated)))

        if outfile_members:
            outfile_members.write("%s\t%s\tm\n" % (str(rep_id), str(rep_id)))

        nrepresentatives += 1
        nmembers += PrintMembers(rep_id, outfile_members, eliminated,
                                 eliminated_by_method)

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write(
        "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %
        (nrepresentatives, nmembers, neliminated,
         nrepresentatives + nmembers + neliminated))
    options.stdlog.write("# elimination by method:\n")

    for v, c in eliminated_by_method.items():
        options.stdlog.write("# method=%s, count=%i\n" % (v, c))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2cleaned_mali.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-m", "--genome-master", dest="genome_master",
                      type="string",
                      help="genome to use as master.")

    parser.add_option("-s", "--filename-removed", dest="filename_removed",
                      type="string",
                      help="output filename for deleted entries.")

    parser.add_option("-e", "--filename-exons", dest="filename_exons",
                      type="string",
                      help="filename with exon information.")

    parser.add_option("-u", "--filename-summary", dest="filename_summary",
                      type="string",
                      help="output filename of component summary.")

    parser.add_option("-c", "--filename-components", dest="filename_components",
                      type="string",
                      help="output filename for components.")

    parser.add_option("--min-percent-overlap", dest="min_percent_overlap",
                      type="float",
                      help="minimum percent overlap for splitting multiple alignment into components.")

    parser.add_option("--max-percent-overlap", dest="max_percent_overlap",
                      type="float",
                      help="maximum percent overlap for split genes.")

    parser.add_option("--min-genomic-distance", dest="min_genomic_distance",
                      type="int",
                      help="minimum genomic distance for adjacent genes to be considered dodgy.")

    parser.add_option("-o", "--mode", dest="mode", type="choice",
                      choices=("joining", "split"),
                      help="""how to filter the alignment.
joining: remove joining transcripts (spindly genes)
split: remove split transcripts""")

    parser.add_option("-g", "--gene-mode", dest="gene_mode",
                      action="store_true",
                      help="""the aligned sequences are genes. This forces
the exon boundaries to be collated by genes.""")

    parser.set_defaults(
        genome_master=None,
        filename_removed=None,
        filename_components=None,
        filename_summary=None,
        filename_exons=None,
        mode="joining",
        input_format="fasta",
        output_format="fasta",
        max_percent_overlap=0,
        min_percent_overlap=0,
        gene_mode=False,
        separator="|")

    (options, args) = E.Start(parser)

    ###############################################################
    ###############################################################
    ###############################################################
    ## input
    ###############################################################
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)
    all_identifiers = mali.getIdentifiers()

    if options.filename_exons:
        ## read exon boundaries and keep forward coordinates
        if options.gene_mode:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             from_zero=True)
            # collate exons by gene: identifiers appear to be of the form
            # species|transcript|gene, so fields 0 and 2 give the gene id
            gene_exons = {}
            for id, ee in exons.items():
                data = id.split(options.separator)
                new_id = options.separator.join((data[0], data[2]))
                if new_id not in gene_exons:
                    gene_exons[new_id] = []
                for e in ee:
                    e.mQueryToken = new_id
                gene_exons[new_id] += ee
            for id, ee in gene_exons.items():
                ee.sort(lambda x, y: cmp(x.mGenomeFrom, y.mGenomeFrom))
            exons = gene_exons
        else:
            exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                             filter=set(all_identifiers),
                                             from_zero=True)
    else:
        exons = {}
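
    ###############################################################
    ## note on the two filtering modes applied below: "joining" removes
    ## transcripts that bridge otherwise separate genes (spindly genes),
    ## "split" removes transcripts split over several fragments; see
    ## removeJoiningTranscripts and removeSplitTranscripts.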
    ###############################################################
    ###############################################################
    ###############################################################
    ## collect all transcripts for a species together with their
    ## aligned length
    ###############################################################
    map_species2transcripts = {}
    for id in mali.getIdentifiers():
        data = id.split(options.separator)
        species = data[0]
        if exons:
            l = exons[id][-1].mGenomeTo - exons[id][0].mGenomeFrom
        else:
            l = len(mali.getEntry(id).getSequence())
        try:
            map_species2transcripts[species].append((l, id))
        except KeyError:
            map_species2transcripts[species] = [(l, id)]

    if options.mode == "joining":
        mapped_transcripts = removeJoiningTranscripts(mali, exons,
                                                      map_species2transcripts,
                                                      options)
    elif options.mode == "split":
        mapped_transcripts = removeSplitTranscripts(mali, exons,
                                                    map_species2transcripts,
                                                    options)

    ###############################################################
    ###############################################################
    ###############################################################
    ## build overlap graph of the remaining sequences and split the
    ## multiple alignment into connected components
    ###############################################################
    graph = networkx.Graph()

    removed_transcripts = set(map(lambda x: x[0], mapped_transcripts))

    for t in all_identifiers:
        if t not in removed_transcripts:
            graph.add_node(t)

    for t1 in range(len(all_identifiers) - 1):
        transcript1 = all_identifiers[t1]
        if transcript1 in removed_transcripts:
            continue
        for t2 in range(t1 + 1, len(all_identifiers)):
            transcript2 = all_identifiers[t2]
            if transcript2 in removed_transcripts:
                continue
            overlap = getPercentOverlap(mali[transcript1], mali[transcript2])
            # note: the edge threshold is hard-coded here;
            # options.min_percent_overlap is not used
            if overlap > 5:
                graph.add_edge(transcript1, transcript2)

    ## compute components; materialize as a list because the components
    ## are iterated several times below
    components = list(networkx.connected_components(graph))

    ###############################################################
    ###############################################################
    ###############################################################
    ## output
    ###############################################################
    if options.filename_components:
        n = 1
        outfile = open(options.filename_components, "w")
        outfile.write("id\tcomponent\n")
        for component in components:
            for c in component:
                outfile.write("%s\t%i\n" % (c, n))
            n += 1
        outfile.close()

    if options.filename_removed and len(removed_transcripts) > 0:
        outfile = open(options.filename_removed, "w")
        outfile.write("removed\trepresentative\treason\n")
        for removed_transcript, rep_transcript, reason in mapped_transcripts:
            outfile.write("%s\t%s\t%s\n" %
                          (removed_transcript, rep_transcript, reason))
        outfile.close()

    if options.filename_summary:
        n = 1
        outfile = open(options.filename_summary, "w")
        outfile.write("component\tsize\tnspecies\tnmaster\n")
        for component in components:
            species = map(lambda x: x.split(options.separator)[0], component)
            # component id, size, number of distinct species, number of
            # sequences from the master genome
            outfile.write("%i\t%i\t%i\t%i\n" %
                          (n, len(component), len(set(species)),
                           len(filter(lambda x: x == options.genome_master,
                                      species))))
            n += 1
        outfile.close()

    for transcript in removed_transcripts:
        mali.deleteEntry(transcript)

    new_identifiers = mali.getIdentifiers()
    mali.removeGaps(minimum_gaps=len(new_identifiers))
    mali.writeToFile(options.stdout, format=options.output_format)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# input=%i, output=%i, removed=%i, ncomponents=%i\n" %
            (len(all_identifiers), len(new_identifiers),
             len(removed_transcripts), len(components)))
        options.stdlog.write(
            "# final component sizes: %s\n" %
            ",".join(map(lambda x: str(len(x)), components)))

    E.Stop()
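
######################################################
## Illustrative sketch (not part of the script): how the component
## splitting above works. Sequences become graph nodes; an edge joins two
## sequences whose aligned (non-gap) columns overlap by more than a cutoff;
## connected components then define the sub-alignments. The percent-overlap
## helper below is a simplified stand-in for getPercentOverlap.
######################################################
import networkx

def _sketch_percent_overlap(seq1, seq2, gap_chars="-."):
    """percent of alignment columns that are non-gap in both sequences,
    relative to the smaller number of non-gap columns (simplified)."""
    both = sum(1 for a, b in zip(seq1, seq2)
               if a not in gap_chars and b not in gap_chars)
    n1 = sum(1 for a in seq1 if a not in gap_chars)
    n2 = sum(1 for b in seq2 if b not in gap_chars)
    smaller = min(n1, n2)
    if smaller == 0:
        return 0.0
    return 100.0 * both / smaller

def _sketch_components(mali, min_percent_overlap=5):
    """mali: dict of id -> aligned sequence.
    Returns a list of connected components (each a sorted list of ids)."""
    graph = networkx.Graph()
    graph.add_nodes_from(mali)
    ids = list(mali)
    for i in range(len(ids) - 1):
        for j in range(i + 1, len(ids)):
            if _sketch_percent_overlap(mali[ids[i]],
                                       mali[ids[j]]) > min_percent_overlap:
                graph.add_edge(ids[i], ids[j])
    # list() so the components can be iterated more than once
    return [sorted(c) for c in networkx.connected_components(graph)]

## Example: two non-overlapping alignment blocks yield two components
## (component order may vary):
##   _sketch_components({"a": "AAAA----", "b": "CCAA----",
##                       "c": "----GGGG", "d": "----TTGG"})
##   -> [['a', 'b'], ['c', 'd']]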