Example #1
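This fragment clusters transcripts either by exon overlap or by exact exon identity, records the strand of each transcript, and then writes a tab-separated table of representatives and members, with alignment columns when peptide sequences are available.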
        raise IOError("no exons in exon list.")

    Exons.SetRankToPositionFlag(exons)

    if param_use_genome_length:
        lengths = Exons.GetGenomeLengths(exons)
    else:
        lengths = Exons.GetPeptideLengths(exons)

    if param_min_overlap > 0:
        map_cluster2transcripts, map_transcript2cluster = ClusterByExonOverlap(
            exons, lengths, peptide_sequences, loglevel=param_loglevel)
    else:
        map_cluster2transcripts, map_transcript2cluster = \
                                 Exons.ClusterByExonIdentity( exons,
                                                              max_terminal_num_exons = 3,
                                                              min_terminal_exon_coverage = param_min_terminal_exon_coverage,
                                                              loglevel = param_loglevel )
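    ## at this point map_cluster2transcripts / map_transcript2cluster hold the
    ## clustering, built either from exon overlap (when a minimum overlap is
    ## required) or from exact exon identity.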

    map_transcript2strand = {}
    for k, ee in exons.items():
        map_transcript2strand[k] = (ee[0].mSbjctStrand == "+")

    nnegatives, npositives = 0, 0
    ## take longest transcript
    cluster_id = 1

    if peptide_sequences:
        print "rep\tmem\tlength\tquery_from\tquery_to\tquery_ali\tsbjct_from\tsbjct_to\tsbjct_ali"
    else:
        print "rep\tmem\tlength"
Example #2
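The main() entry point of gpipe/select_transcripts.py. It parses command-line options, reads transcript predictions from a GTF stream on stdin, loads peptide, extended peptide, genome and exon data, and then removes transcripts in several passes (overlap with a filter set, gene spanners, exon swoppers, suboptimal predictions, redundant predictions), writing representatives to stdout and the member assignments to a separate file.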
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gpipe/select_transcripts.py 2263 2008-11-17 16:36:29Z andreas $", usage = globals()["__doc__"] )

    parser.add_option( "-o", "--overlap", dest="overlap_residues", type="int",
                       help="overlap residues.")
    parser.add_option( "-t", "--filter-tokens", dest="filename_filter_tokens", type="string",
                       help="filename to filter tokens." )
    parser.add_option( "-i", "--exon-identity", dest="exon_identity", action="store_true",
                       help="exon identity." )
    parser.add_option( "--exons", dest="filename_exons", type="string",
                       help="filename with exon information." )
    parser.add_option( "-m", "--output-members", dest="filename_members", type="string",
                       help="output filename with members." )
    parser.add_option( "--overlap-id", dest="overlap_id", action="store_true",
                       help="overlap id." )
    parser.add_option( "-s", "--remove-spanning", dest="remove_spanning_predictions", action="store_true",
                       help="remove spanning predictions." )
    parser.add_option( "-c", "--remove-complement", dest="remove_complementary_predictions", action="store_true",
                       help="remove complementary predictions." )
    parser.add_option( "--remove-exon-swoppers", dest="remove_exon_swoppers", action="store_true",
                       help="remove exon swoppers." )
    parser.add_option( "--remove-gene-spanners", dest="remove_gene_spanners", action="store_true",
                       help="remove gene spanners." )
    parser.add_option( "--remove-suboptimal", dest="remove_suboptimal", action="store_true",
                       help="remove suboptimal predictions." )
    parser.add_option( "-p", "--peptides", dest="filename_peptides", type="string",
                       help="filename with peptide information." )
    parser.add_option( "--extended-peptides", dest="filename_extended_peptides", type="string",
                       help="filename with peptide information - after extension." )
    
    parser.add_option( "--test", dest="test_nids", type="string",
                       help="test nids." )
    ## filter options
    parser.add_option( "--filter-transcripts", dest="filter_filename_transcripts", type="string",
                       help="filename with transcripts that are used to filter." )
    parser.add_option( "--filter-remove-spanning", dest="filter_remove_spanning", action="store_true",
                       help="remove all transcripts that span the filter set." )
    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genomic data (indexed)." )
    parser.add_option( "--discard-large-clusters", dest="discard_large_clusters", type="int",
                       help="if set discard clusters bigger than this size (patch) [default=%default]." )
    

    parser.set_defaults(
        filename_members = None,
        filename_peptides = None,
        filename_extended_peptides = None,
        filename_exons = None,
        quality_hierarchy = ("CG", "PG", "SG", "RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK" ),
        ## Classes where redundancy is removed by similarity. When exon structure
        ## is not conserved, alternative splice variants cannot be predicted, so
        ## the redundancy is removed.
        quality_exclude_same = ( "UG", "UP", "UF", "BF", "UK" ),
        quality_genes = ("CG", "SG", "PG", "RG", "UG"),
        ## class that can be removed in spanning/complementary predictions
        quality_remove_dubious = ( "UG", "UP", "UF", "BF", "UK" ),
        ## class that is required for defining an exon swopper event
        quality_remove_exon_swopper = ("CG", "PG"),
        ## class that will be kept, in spite of being an exon swopper
        quality_keep_exon_swopper = (),
        ## class that is required for removing gene spanners
        quality_remove_gene_spanners = ("CG",),
        ## class that will be kept, in spite of being a gene spanner
        quality_keep_gene_spanners = (),
        ## class that is required for defining suboptimal matches
        quality_remove_suboptimal = ("CG", "PG" ),
        ## class that will be kept, in spite of being a suboptimal match
        quality_keep_suboptimal = (),
        ## gap penalties
        gop = -10.0,
        gep = -1.0,
        ## maximum number of gaps to allow in alignment
        max_gaps = 20,
        ## threshold of percent identity above which a prediction of a lower
        ## class may be removed.
        ## This allows for insertions/deletions.
        min_identity = 98,
        ## threshold of percent identity above which a gene may remove a
        ## prediction of a non-gene
        min_identity_non_genes = 80,
        ## safety thresholds: do not remove a member whose identity or coverage
        ## is better than the representative's by more than x
        safety_pide = 10,
        safety_coverage = 10,
        overlap_id = False,
        remove_spanning_predictions = False,
        remove_exon_swoppers = False,
        remove_gene_spanners = False,
        remove_suboptimal = False,
        ## nids to use for testing
        test_nids = None,
        ## remove members with less than maximum coverage
        max_member_coverage = 90,
        ## maximum allowable exon slippage
        max_slippage = 9,
        ## minimum difference in identity for suboptimal predictions to be removed.
        suboptimal_min_identity_difference = 10,
        ## filter options
        filter_filename_transcripts = None,
        filter_remove_spanning = True,
        filter_remove_spanning_both_strands = True,
        genome_file = None,
        discard_large_clusters = None )
    
    (options, args) = E.Start( parser, add_psql_options = True )    

    if options.test_nids: options.test_nids = options.test_nids.split(",")

    # list of eliminated predictions
    eliminated_predictions = {}
    
    if options.filename_members:
        outfile_members = open( options.filename_members, "w" )
    else:
        outfile_members = sys.stdout

    ######################################################
    ######################################################
    ######################################################        
    # data
    ######################################################    
    data = []

    class Entry:
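        """Per-transcript record built from one GTF line: percent identity,
        query coverage, quality class, and genomic / extended coordinates."""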
        def __init__(self, gff):
            self.mPid = float(gff["pid"])
            self.mQueryCoverage = float(gff["qcov"])
            self.gene_id = gff['gene_id']
            self.transcript_id = gff['transcript_id']
            self.mExtendedStart = int( gff['xstart'] )
            self.mExtendedEnd = int( gff['xend'] )
            self.start = gff.start
            self.contig = gff.contig
            self.strand = gff.strand
            self.end = gff.end
            self.mQuality = gff['class']
            
    for gff in GTF.iterator( sys.stdin ):
        data.append( Entry(gff) )

    if options.loglevel >= 1:
        options.stdlog.write( "# read %i transcripts.\n" % len(data) )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    # read peptide sequences
    ######################################################    
    if options.loglevel >= 1:
        options.stdlog.write( "# loading peptide databases ... " )
        options.stdlog.flush()

    if options.filename_peptides:
        peptides = IndexedFasta.IndexedFasta( options.filename_peptides )
        peptide_lengths = peptides.getContigSizes()
    else:
        peptide_lengths = {}
        peptides = {}

    ######################################################
    ######################################################
    ######################################################        
    # read extended peptide sequences
    ######################################################    
    if options.filename_extended_peptides:
        extended_peptides = IndexedFasta.IndexedFasta( options.filename_extended_peptides )
    else:
        extended_peptides = {}

    if options.loglevel >= 1:
        options.stdlog.write( "finished\n" )
        options.stdlog.flush()

    ######################################################
    ######################################################
    ######################################################        
    ## open genome file
    ######################################################        
    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
    else:
        contig_sizes = {}

    ######################################################
    ######################################################
    ######################################################        
    ## reading exons, clustering and formatting them.
    ######################################################        
    if options.filename_exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries ... " )
            options.stdlog.flush()
            
        ids = [ x.transcript_id for x in data ] 

        exons = Exons.ReadExonBoundaries( open( options.filename_exons, "r"),
                                          contig_sizes = contig_sizes,
                                          filter = set(ids) )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(exons) ))

        if len(exons) == 0:
            raise ValueError("no exons found in table.")
            
        # flag terminal exons
        Exons.SetRankToPositionFlag( exons )

        identity_map_cluster2transcripts, identity_map_transcript2cluster =\
                                          Exons.ClusterByExonIdentity( exons,
                                                                       max_terminal_num_exons = 3,
                                                                       max_slippage= options.max_slippage,
                                                                       loglevel = options.loglevel )

        overlap_map_cluster2transcripts, overlap_map_transcript2cluster =\
                                         Exons.ClusterByExonOverlap( exons,
                                                                     min_overlap = 10,
                                                                     loglevel = options.loglevel )
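        # two clusterings of the same exon set: the identity clusters are used
        # for exon-swopper removal, the overlap clusters for suboptimal-prediction
        # removal further below.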
    else:
        exons = {}

    ######################################################        
    nrepresentatives, nmembers, neliminated = 0, 0, 0
    eliminated_by_method = {}

    ######################################################
    ######################################################
    ######################################################
    ## read filter transcripts and apply filters
    ######################################################        
    if options.filter_filename_transcripts:

        if options.loglevel >= 1:
            options.stdlog.write( "# reading exon boundaries for filter set ... " )
            options.stdlog.flush()
            
        filter_exons = Exons.ReadExonBoundaries( open( options.filter_filename_transcripts, "r" ),
                                                 delete_missing = True,
                                                 contig_sizes = contig_sizes )

        if options.loglevel >= 1:
            options.stdlog.write( "done - read exons for %i transcripts\n" % (len(filter_exons)) )
        
        t = time.time()
        eliminated = FilterEliminateOverlappingTranscripts( exons,
                                                            filter_exons,
                                                            eliminated_predictions,
                                                            contig_sizes,
                                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i transcripts overlapping or spanning transcripts in %i seconds.\n" % (n, time.time()-t ))
            options.stdlog.flush()

    if options.remove_exon_swoppers and not exons:
        raise ValueError( "please specify exon table if using --remove-swoppers." )
    if options.remove_gene_spanners and not exons:
        raise ValueError( "please specify exon table if using --remove-gene-spanners." )

    ########################################################################################
    ## remove predictions that span other predictions but do not overlap with them at the exon level.
    if options.remove_gene_spanners and exons:
        if options.loglevel >= 1:
            options.stdlog.write( "# removing gene spanners\n" )
            options.stdlog.flush()
            
        t = time.time()
        eliminated = EliminateGeneSpanners( data,
                                            eliminated_predictions,
                                            exons,
                                            options )

        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n
        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i gene spanners in %i seconds\n" % (n, time.time()-t ))
            options.stdlog.flush()
            
    ########################################################################################
    ## sort data by quality, length of prediction and coverage * pid

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data\n" )
        options.stdlog.flush()

    map2pos = {}
    for x in range(len(options.quality_hierarchy)):
        map2pos[options.quality_hierarchy[x]] = x

    data.sort( key = lambda x: (map2pos[x.mQuality], len(extended_peptides[x.transcript_id]), x.mQueryCoverage * x.mPid ) )

    # build map of prediction to quality
    map_prediction2data = {}
    for d in data:
        map_prediction2data[d.transcript_id] = d

    if options.loglevel >= 1:
        options.stdlog.write( "# sorting data finished\n" )
        options.stdlog.flush()

    ########################################################################################
    ## remove predictions joining two other complete non-overlapping predictions
    if options.remove_exon_swoppers and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing exon swoppers\n" )
            options.stdlog.flush()

        eliminated = EliminateExonSwoppers( data,
                                            eliminated_predictions,
                                            identity_map_transcript2cluster,
                                            identity_map_cluster2transcripts,
                                            map_prediction2data,
                                            exons,
                                            options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i exon swoppers\n" % n )
            options.stdlog.flush()

    ########################################################################################
    ## remove suboptimal predictions
    if options.remove_suboptimal and exons:

        if options.loglevel >= 1:
            options.stdlog.write( "# removing suboptimal predictions\n" )
            options.stdlog.flush()

        t = time.time()
        eliminated = EliminateSuboptimalPredictions( data,
                                                     eliminated_predictions,
                                                     overlap_map_transcript2cluster,
                                                     overlap_map_cluster2transcripts,
                                                     map_prediction2data,
                                                     exons,
                                                     options )
        
        n = PrintMembers( 0, outfile_members, eliminated, eliminated_by_method )
        neliminated += n

        if options.loglevel >= 1:
            options.stdlog.write( "# removed %i suboptimal predictions in %i seconds\n" % (n, time.time()-t) )
            options.stdlog.flush()        

    ########################################################################################
    ## remove redundant predictions
    l = len(data)
        
    options.report_step = max(1, int(l / 100))

    t2= time.time()

    last_quality = None
    qualities = []
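    # qualities collects the quality classes that have already been processed;
    # it is passed as filter_quality to the elimination routines below.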

    options.stdout.write( "%s\t%s\n" % ("rep", "comment") )
    
    for x in range(len(data)):

        if options.loglevel >= 1:
            if x % options.report_step  == 0:
                options.stdlog.write( "# process: %i/%i = %i %%, %i/%i = %i %% in %i seconds\n" % \
                                      (x+1, l,
                                       int(100 * (x+1) / l),
                                       len(eliminated_predictions), l,
                                       100 * len(eliminated_predictions) / l,
                                       time.time() - t2 ) )
                                                                    
                options.stdlog.flush()
                
        rep = data[x]

        rep_id, rep_quality = rep.transcript_id, rep.mQuality
        
        if rep_id in eliminated_predictions: continue

        if rep_quality != last_quality:
            if last_quality:
                qualities.append( last_quality )
            last_quality = rep_quality
        
        if options.loglevel >= 2:
            options.stdlog.write( "# processing prediction %s|%s\n" % (rep_id, rep_quality) )
            options.stdlog.flush()

        eliminated = []

        if options.overlap_id:
            eliminated += EliminateRedundantEntriesByOverlap( rep,
                                                              data[x+1:],
                                                              eliminated_predictions,
                                                              options,
                                                              peptides, 
                                                              extended_peptides,
                                                              filter_quality = qualities,
                                                              this_quality = rep_quality )
                                                              
        else:
            eliminated += EliminateRedundantEntriesByRange( rep,
                                                            data,
                                                            eliminated_predictions,
                                                            options,
                                                            peptides, 
                                                            extended_peptides,
                                                            filter_quality = qualities,
                                                            this_quality = rep_quality )
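        # both branches return the entries eliminated by this representative;
        # the overlap variant only inspects transcripts sorted after rep, the
        # range variant rescans the full list.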

        options.stdout.write( "%s\t%i\n" % (rep_id, len(eliminated)) )

        if outfile_members:
            outfile_members.write( "%s\t%s\tm\n" % (str(rep_id), str(rep_id)))
            nrepresentatives += 1
            nmembers += PrintMembers( rep_id, outfile_members, eliminated, eliminated_by_method )            

    if outfile_members != sys.stdout:
        outfile_members.close()

    options.stdlog.write( "# representatives=%i, members=%i, eliminated=%i, total=%i\n" %\
                          (nrepresentatives, nmembers, neliminated,
                           nrepresentatives+nmembers+neliminated ) )
    
    options.stdlog.write( "# elimination by method:\n" )
    
    for v,c in eliminated_by_method.items():
        options.stdlog.write( "# method=%s, count=%i\n" % (v, c) )

    E.Stop()