def confirmcanonicalsplicesites(sequence,gfflist,exon_fmethod=None,verbose=False):
    """
    Check the splice sites of the introns between consecutive exon tracks

    @type  sequence: string
    @param sequence: sequence that is sliced with the gff coordinates

    @type  gfflist: list
    @param gfflist: list with gff tuples

    @type  exon_fmethod: string (or None)
    @param exon_fmethod: fmethod handed to filtergffs4fmethod() to select
                         the exon tracks

    @type  verbose: Boolean
    @param verbose: print each warning to STDOUT (True) or not (False, default)

    @rtype:  tuple
    @return: ( Boolean, list ); True when only canonical sites are observed,
             and the list with created splice site Warning objects

    @attention: requires global variables CANONICAL_DONOR_SITES,
                NON_CANONICAL_DONOR_SITES and CANONICAL_ACCEPTOR_SITES
    """
    # NOTE: sequence has original orientation, but exon coords may have
    # been transferred to other strand
    exon_tracks = filtergffs4fmethod(gfflist,exon_fmethod)
    all_canonical = True
    collected = []

    # walk over each pair of neighbouring exons (upstream, downstream)
    for upstream, downstream in zip(exon_tracks, exon_tracks[1:]):
        # 2 positions directly following the end coordinate of the upstream exon
        donor_site = sequence[upstream[4]:upstream[4]+2].upper()
        # 2 positions directly preceding the start coordinate of the downstream exon
        acceptor_site = sequence[downstream[3]-3:downstream[3]-1].upper()

        if donor_site not in CANONICAL_DONOR_SITES:
            all_canonical = False
            # known non-canonical donors get a milder warning than unknown ones
            if donor_site in NON_CANONICAL_DONOR_SITES:
                donor_warning = NonCanonicalSpliceSiteWarning("Donor:"+donor_site)
            else:
                donor_warning = ImplausibleSpliceSiteWarning("Donor:"+donor_site)
            collected.append( donor_warning )
            if verbose:
                print(collected[-1])

        if acceptor_site not in CANONICAL_ACCEPTOR_SITES:
            collected.append( ImplausibleSpliceSiteWarning("Acceptor:"+acceptor_site) )
            all_canonical = False
            if verbose:
                print(collected[-1])

    # return the status of the observed splice sites
    return all_canonical, collected
def create_gene_utrs(gene_gff_list):
    """
    Derive UTR tracks from the exon and CDS tracks of an (annotated) gene

    Exon coordinates upstream of the first CDS coordinate become 5'UTR,
    exon coordinates downstream of the last CDS coordinate become 3'UTR.

    @type  gene_gff_list: list
    @param gene_gff_list: list with gff tuples of the (annotated) gene

    @rtype  utrs: list
    @return utrs: list with gff tuples of the UTR tracks (can be empty)

    @attention: requires global variables GFF_CDS_FMETHOD, GFF_EXON_FMETHOD
    @attention: requires global variables GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD
    @attention: requires global variables GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD
    """
    coding = gffs2coordset(gene_gff_list,fmethod=[GFF_CDS_FMETHOD])
    exonic = gffs2coordset(gene_gff_list,fmethod=[GFF_EXON_FMETHOD])

    # both coordinate sets are needed to deduce any UTR
    if not (coding and exonic):
        return []

    cds_first, cds_last = min(coding), max(coding)
    exon_first, exon_last = min(exonic), max(exonic)

    # exon coordinates in front of the first coding coordinate
    five_prime = [ pos for pos in range(exon_first, cds_first) if pos in exonic ]

    # exon coordinates behind the last coding coordinate; when the exon
    # ends exactly 3nt after the CDS this is only the stop codon -> no UTR
    three_prime = []
    if cds_last+3 != exon_last:
        three_prime = [ pos for pos in range(cds_last+1, exon_last+1) if pos in exonic ]

    if not (five_prime or three_prime):
        return []

    # the first coding exon serves as backbone for all new tracks
    backbone = filtergffs4fmethod(gene_gff_list,fmethod=GFF_CDS_FMETHOD)[0]
    utrs = []
    for coords, fsource, fmethod in (
            ( five_prime, GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD ),
            ( three_prime, GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD ),
            ):
        template = list( backbone )
        template[1] = fsource
        template[2] = fmethod
        utrs.extend( coordset2gfftracks(coords,template) )

    # return the utr tracks
    return utrs
def annotatedgeneexonsizeevaluation(input,small_exon_nt_threshold=25,small_intron_nt_threshold=42):
    """
    Check the size of the annotated coding exons and the introns in between

    @type  input: dict
    @param input: input dictionary data structure

    @type  small_exon_nt_threshold: integer
    @param small_exon_nt_threshold: coding exons of at most this length (nt)
                                    cause a warning

    @type  small_intron_nt_threshold: integer
    @param small_intron_nt_threshold: introns shorter than this length (nt)
                                      cause a warning

    @rtype:  Boolean
    @return: are small coding exons present in the available annotations?

    @attention: Warning objects are appended to input[org]['warnings']
    @attention: requires global variable GFF_CDS_FMETHOD
    """
    found_tiny_exon = False
    # ( key with gff tracks, key with orf ids, label used in the message )
    evidence_sources = (
        ('gff-gene', 'orfid-genestructure', 'annotated gene'),
        ('gff-unigene', 'orfid-unigenestructure', 'unigene'),
        )
    template = "organism: '%s' %s (gff) start: %s end: %s length: %s"

    for org in input.keys():
        orgdata = input[org]
        for gffkey, orfidkey, evidence in evidence_sources:
            if not orgdata[gffkey]:
                continue
            cds_tracks = filtergffs4fmethod(orgdata[gffkey],GFF_CDS_FMETHOD)
            final_index = len(cds_tracks)-1

            # report small coding exons
            for index, cds in enumerate(cds_tracks):
                sta0, end = int(cds[3])-1, int(cds[4])
                length = end - sta0
                if length > small_exon_nt_threshold:
                    continue
                found_tiny_exon = True
                text = template % ( org, evidence, sta0+1, end, length )

                # name the first listed Orf that spans this exon, if any
                if orgdata[orfidkey]:
                    for orfid in orgdata[orfidkey]:
                        orf = orgdata['orfs'].get_orf_by_id(orfid)
                        if orf.start <= sta0+1 and orf.end >= end:
                            text += " on %s" % (str(orf))
                            break

                orgdata['warnings'].append( SmallAnnotatedExonWarning(text) )
                # additional warning when it is the first or the final exon
                if index == 0:
                    orgdata['warnings'].append( SmallAnnotatedFirstExonWarning(text) )
                elif index == final_index:
                    orgdata['warnings'].append( SmallAnnotatedFinalExonWarning(text) )

            # report small annotated introns between neighbouring coding exons
            for previous, following in zip(cds_tracks, cds_tracks[1:]):
                intron_sta = int(previous[4])
                intron_end = int(following[3])-1
                intron_len = intron_end - intron_sta
                if intron_len < small_intron_nt_threshold:
                    text = template % (
                        org, evidence, intron_sta+1, intron_end, intron_len )
                    orgdata['warnings'].append( SmallAnnotatedIntronWarning(text) )

    # return the Boolean status
    return found_tiny_exon
def geneconfirmation(input,verbose=False):
    """
    Confirm given gene GFF structure with Orfs of the DNA sequence

    @type  input: dict
    @param input: input dictionary data structure

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  tuple
    @return: tuple of ( input data structure, TRACKS_ARE_PROPERLY_MATCHED boolean )

    @attention: input[org]['gff-gene'], input[org]['warnings'] and
                input[org]['orfid-genestructure'] are updated in place
    @attention: EXECUTABLE_GFF2FASTA is run in a subshell for genes that
                have exon tracks but no CDS tracks
    """
    # function-scope import; subprocess replaces the obsolete os.popen3
    import subprocess

    input = readsequences(input)
    input = rungetorf(input)
    input = parseinputgff(input)
    GENE_TRACKS_ARE_PROPERLY_MATCHED = True

    for org in input.keys():
        # (A) find the genestructure_orfmodel of the annotated/known gene structure
        input[org]['orfid-genestructure'] = []
        _GENE_TRACKS_ARE_PROPERLY_MATCHED = False

        if input[org]['gff-gene']:
            # get only the CDS-type of tracks that define the coding sequence
            genecdstracks = filtergffs4fmethod( input[org]['gff-gene'], GFF_CDS_FMETHOD )
            # for comparison / later use get exon tracks too
            geneexontracks = filtergffs4fmethod( input[org]['gff-gene'], GFF_EXON_FMETHOD )

            if genecdstracks:
                # as expected...
                pass
            elif geneexontracks:
                # MANY GeneLoci miss CDS tracks but have exon tracks.
                # In the BROAD annotation logic this represents partial genes:
                # based on similarity, but missing start and/or stop codon.
                # In practice, these instances are likely the result of
                # sequence errors!
                # The choice that was made here is as follows:
                # - check the exon tracks as if they represent unigenes
                # - if OK -> convert the exon tracks to CDS tracks
                # - GENE_TRACKS_ARE_PROPERLY_MATCHED will later in this
                #   function be recognized as False

                # Append the Warning message
                message = "no tracks of type '%s'" % (GFF_CDS_FMETHOD)
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )

                # obtain exon track's DNA sequence
                # NOTE(review): the file name is interpolated unquoted into
                # a shell pipeline; confirm it cannot contain shell
                # metacharacters.
                command = """%s %s %s | grep -v ">" | tr -d "\\n" """ % (
                    PYTHON_PATH,
                    EXECUTABLE_GFF2FASTA,
                    input[org]['genomeseqfile'] )

                # make sure geneexontracks FREF matches input[org]['genomeseqfile'] FREF.
                # Only the first line is needed; the line end is stripped so
                # a header without description does not leave a newline
                # inside the FREF that is substituted into the gff text.
                with open(input[org]['genomeseqfile']) as fh:
                    fasta_header_line = fh.readline()
                loci_dna_fref = fasta_header_line.split(' ')[0].replace(">","").rstrip("\r\n")
                gfftxt = gffs2txt(geneexontracks).replace(
                    geneexontracks[0][0], loci_dna_fref )

                # communicate() feeds STDIN and drains STDOUT and STDERR
                # together, so a large STDERR cannot block the pipeline
                proc = subprocess.Popen(
                    command, shell=True,
                    stdin=subprocess.PIPE,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True )
                fasta_out, error = proc.communicate(gfftxt)
                protseq = dna2protein( fasta_out.strip(), 0 )

                if "unrecognized header:" in error:
                    # FREF mismatch between gff and fasta; print debug info
                    print("XXXX %s" % (error,))
                    print(command)
                    print(gffs2txt(geneexontracks))
                    print("%s %s %s" % (
                        input[org]['genomefref'],
                        input[org]['locusfref'],
                        input[org]['proteinfref'] ))
                    print(fasta_header_line)

                if protseq.count('*') == 0:
                    # The EXON type tracks have an ORF in frame 0
                    # This represents a partial gene structure.
                    # Copy the EXON tracks as CDS tracks and run
                    # the rest of this function.
                    for _track in geneexontracks:
                        track = list(_track)
                        track[2] = GFF_CDS_FMETHOD
                        track = tuple(track)
                        input[org]['gff-gene'].append( track )
                        genecdstracks.append( track )
                else:
                    message = "tracks of type '%s' have no ORF" % (GFF_EXON_FMETHOD)
                    warn = IncompleteGeneStructureWarning(message)
                    input[org]['warnings'].append( warn )
                    GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                    continue

                ################################################################
                if verbose:
                    print("%s NO CDS TRACKS!! %s" % (org, input[org]['FREF']))
                    print("%s %s" % (protseq[0:20]+"..."+protseq[-20:], len(protseq)))
                    for track in genecdstracks:
                        print(track[0:7])
                ################################################################
            else:
                # neither types of tracks -> no gene annotation at all!
                GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                continue

            # correct the final exon for absence of STOP-codon
            # many given gene annotations do not include STOP-codon in final exon, some do
            # By definition:
            #   A track of type CDS  should EXCLUDE the stop codon triplet itself
            #   A track of type exon should INCLUDE the stop codon triplet itself
            # Not all annotations follow this principle, however...
            # And, in ABFGP, it was chosen to use ANNOTATED CDS tracks as input and
            # PREDICTED EXON tracks as output.
            # This was done to have the stop-codon included as a final check for
            # the final exon (if the predicted CDS is indeed followed by a stop codon).
            if not genestructurehasproperstopcodon(genecdstracks,
                    input[org]['genomeseq'],GFF_CDS_FMETHOD,verbose=False):
                # Well, that is as expected -> we expect here CDS tracks
                # check if the NEXT triplet is a STOP-codon
                try:
                    cds_end = genecdstracks[-1][4]
                    triplet = input[org]['genomeseq'][ cds_end : cds_end+3 ].lower()
                except Exception:
                    # here errors have occurred a lot! Some annotations mix up
                    # exon/CDS tracks. Print this exception ALWAYS and let the
                    # original error propagate.
                    print("Exception occurred:")
                    print("%s genecdstracks: %s" % (org, len(genecdstracks)))
                    raise
                if triplet in ('tga','tag','taa'):
                    # update track by +3 length for STOP-codon
                    # this update is only valid within the scope of this function;
                    # input[org]['gff-gene'] remains unchanged.
                    track = list( genecdstracks[-1] )
                    track[4] += 3
                    genecdstracks[-1] = tuple(track)
                # else: no CDS type of track followed by a stop-codon;
                # ignore here, error message will follow later
            elif genecdstracks and geneexontracks and\
            genecdstracks[-1][3] == geneexontracks[-1][3] and\
            genecdstracks[-1][4]+3 == geneexontracks[-1][4]:
                # do not correct anything; it's fine!
                pass
            else:
                # hmmm.... that is weird! We expect here CDS tracks, not EXON tracks
                # do a HARD correction on the provided annotation track!
                # TODO: this piece of code will fail for spliced stop codons:
                # Tgt.....agGA because it is spread over >1 track
                for pos, track in enumerate(input[org]['gff-gene']):
                    if track == genecdstracks[-1]:
                        # HARD-replace in the input[org]['gff-gene'] list with gff tuples!
                        corrected = list(track)
                        corrected[4] = corrected[4]-3
                        input[org]['gff-gene'][pos] = tuple(corrected)
                        break
                else:
                    # no track equal to the final CDS track was found
                    print("GeneStructureWarning: VERY UNUSUAL SPLITTED STOP CODON %s" % (org,))

            # convert exon/CDS tracks to UTR tracks
            utrs = create_gene_utrs(input[org]['gff-gene'])
            input[org]['gff-gene'].extend(utrs)

            # get the orfs of the genestructure
            orfids,_GENE_TRACKS_ARE_PROPERLY_MATCHED = genestructuregff2orfs(
                genecdstracks,input[org]['orfs'],verbose=False)
            if (not _GENE_TRACKS_ARE_PROPERLY_MATCHED or not orfids):
                print("_GENE_TRACKS_ARE_PROPERLY_MATCHED == False %s %s" % (org, orfids))
                # redo just for logging purposes!
                orfids,_GENE_TRACKS_ARE_PROPERLY_MATCHED = genestructuregff2orfs(
                    genecdstracks,input[org]['orfs'],verbose=True)

            # place the list with orfids in 'orfid-genestructure' dict key
            input[org]['orfid-genestructure'] = orfids

            # check if the gene was properly matched.
            if not _GENE_TRACKS_ARE_PROPERLY_MATCHED or not input[org]['orfid-genestructure']:
                GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                # append warning to warnings list
                warn = GeneStructureIsNotMappableOnOrfsWarning("not mappable on orfs")
                input[org]['warnings'].append( warn )
                # set known gene-tracks to empty list!
                input[org]['orfid-genestructure'] = []
                # check for a potential sequence error in the sequence
                potentialsequenceerror(genecdstracks,input[org]['orfs'],verbose=verbose)
            else:
                # all fine. Label all the **known** Orfs with IS_ANNOTATED_EXON_LABEL
                for orfid in input[org]['orfid-genestructure']:
                    orfObj = input[org]['orfs'].get_orf_by_id(orfid)
                    # add label to Orf object
                    setattr(orfObj,IS_ANNOTATED_EXON_LABEL,True)

        # vars for annotated start & stop codon check
        GENE_HAS_PROPER_START_CODON = False
        GENE_HAS_PROPER_STOP_CODON = False

        # (B) check start codon of the gene structure
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and input[org]['gff-gene']:
            GENE_HAS_PROPER_START_CODON = genestructurehasproperstartcodon(
                genecdstracks, input[org]['genomeseq'],
                GFF_CDS_FMETHOD, verbose=verbose)
            # create warning message if a problem occurred
            if not GENE_HAS_PROPER_START_CODON:
                try:
                    sta,end = genecdstracks[0][3]-1, genecdstracks[0][3]+2
                    startdna = input[org]['genomeseq'][sta:end]
                    orf = input[org]['orfs'].get_orf_by_id(
                        input[org]['orfid-genestructure'][0] )
                    message = "no proper start codon (%s [%s] %s) on orf %s" % (
                        startdna, dna2protein(startdna,0),
                        list(range(sta,end)), orf )
                except Exception:
                    # coordinates are horribly wrong -> make simple warning message
                    message = "no proper start codon (Exception)"
                # append warning to warnings list
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )

        # (C) check stop codon of the gene structure
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and input[org]['gff-gene']:
            GENE_HAS_PROPER_STOP_CODON = genestructurehasproperstopcodon(
                genecdstracks, input[org]['genomeseq'],
                GFF_CDS_FMETHOD, verbose=verbose)
            if not GENE_HAS_PROPER_STOP_CODON:
                try:
                    sta,end = genecdstracks[-1][4]-3, genecdstracks[-1][4]
                    stopdna = input[org]['genomeseq'][sta:end]
                    message = "no proper stop codon (%s [%s] %s) on orf %s" % (
                        stopdna, dna2protein(stopdna,0), list(range(sta,end)),
                        input[org]['orfs'].get_orf_by_id(
                            input[org]['orfid-genestructure'][-1]) )
                except Exception:
                    # coordinates are horribly wrong -> make simple warning message
                    message = "no proper stop codon (Exception)"
                # append warning to warnings list
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )

        # (D) check if start_codon is present as annotated track
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and GENE_HAS_PROPER_START_CODON and\
        input[org]['gff-gene'] and not filtergffs4fmethod(
        input[org]['gff-gene'], GFF_GENESTART_FMETHOD ):
            # create start_codon track
            # TODO: write a generic function that does the job,
            # TODO: which takes spliced start codons into account
            startcodon = list(genecdstracks[0])
            startcodon[2] = GFF_GENESTART_FMETHOD
            startcodon[4] = startcodon[3]+2
            startcodon[5] = "."
            startcodon[7] = "."
            startcodon[8] = startcodon[8].split("; ")[0]
            input[org]['gff-gene'].append( tuple(startcodon) )
            ################################################################
            if verbose:
                sta,end = genecdstracks[0][3]-1, genecdstracks[0][3]+2
                startdna = input[org]['genomeseq'][sta:end]
                print("CREATED: %s" % (genecdstracks[0][0:7],))
                print("CREATED: %s" % (tuple(startcodon),))
                print("CREATED: %s %s %s %s" % (
                    sta, end, startdna, dna2protein(startdna,0) ))
            ################################################################

        # (E) check if stop_codon is present as annotated track
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and GENE_HAS_PROPER_STOP_CODON and\
        input[org]['gff-gene'] and not filtergffs4fmethod(
        input[org]['gff-gene'], GFF_GENESTOP_FMETHOD ):
            # create stop_codon track: the final 3nt of the final CDS track
            stopcodon = list(genecdstracks[-1])
            stopcodon[2] = GFF_GENESTOP_FMETHOD
            stopcodon[3] = stopcodon[4]-2
            stopcodon[5] = "."
            stopcodon[7] = "."
            stopcodon[8] = stopcodon[8].split("; ")[0]
            input[org]['gff-gene'].append( tuple(stopcodon) )
            ################################################################
            if verbose:
                sta,end = genecdstracks[-1][4]-3, genecdstracks[-1][4]
                stopdna = input[org]['genomeseq'][sta:end]
                print("CREATED: %s" % (genecdstracks[-1][0:7],))
                print("CREATED: %s" % (tuple(stopcodon),))
                print("CREATED: %s %s %s %s" % (
                    sta, end, stopdna, dna2protein(stopdna,0) ))
            ################################################################

        # (F) label the first & final (annotated) Orfs in the gene structure
        if GENE_HAS_PROPER_START_CODON and input[org]['orfid-genestructure']:
            firstorfid = input[org]['orfid-genestructure'][0]
            firstorf = input[org]['orfs'].get_orf_by_id(firstorfid)
            # add label to Orf object
            setattr(firstorf,FIRST_ANNOTATED_EXON_LABEL,True)
        if GENE_HAS_PROPER_STOP_CODON and input[org]['orfid-genestructure']:
            finalorfid = input[org]['orfid-genestructure'][-1]
            finalorf = input[org]['orfs'].get_orf_by_id(finalorfid)
            # add label to Orf object
            setattr(finalorf,FINAL_ANNOTATED_EXON_LABEL,True)

        # check if START & STOP codon are properly matched
        if not GENE_HAS_PROPER_START_CODON or not GENE_HAS_PROPER_STOP_CODON:
            _GENE_TRACKS_ARE_PROPERLY_MATCHED = False
            GENE_TRACKS_ARE_PROPERLY_MATCHED = False

        # print the negative outcome in verbose mode
        # NOTE(review): the reset of 'orfid-genestructure' below only happens
        # when verbose is True, so the returned data differs between verbose
        # and silent runs. Kept as-is; confirm whether the reset should be
        # unconditional.
        if not _GENE_TRACKS_ARE_PROPERLY_MATCHED and verbose:
            # set known gene-tracks to empty list!
            input[org]['orfid-genestructure'] = []
            print("# WARNING: Gene is not mappable on Orfs: %s" % (org,))

    # return the incremented input dict and error status (True or False)
    return input, GENE_TRACKS_ARE_PROPERLY_MATCHED
def _consecutive_coord_ranges(coords):
    """ Collapse coordinates into ordered (start, end) tuples of consecutive runs """
    runs = []
    for coord in sorted(coords):
        if runs and coord == runs[-1][1] + 1:
            # extends the current run by one position
            runs[-1][1] = coord
        else:
            runs.append([coord, coord])
    return [(sta, end) for sta, end in runs]


# NOTE(review): a second definition of correct_unigene_for_utrs follows later
# in this file; the later definition is the one bound after import. Confirm
# which of the two should be kept.
def correct_unigene_for_utrs(
    unigene_gff_list,
    start_codon_gff=(),
    stop_codon_gff=(),
    minimal_likely_tss_pssm_score=3.0,
    shift_tss_pssm_score_ratio=4.0,
    dnaseqfname=None,
    verbose=False,
):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon

    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  tuple
    @return: ( list with corrected gff tuples, typeofunigene string or None )

    @attention: Global variable GFF_UGEXON_FMETHOD is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD is required for this function
    @attention: when dnaseqfname is given, EXECUTABLE_UNIGENEANNOTATION is
                run in a subshell
    """
    # function-scope import; subprocess replaces the obsolete os.popen2
    import subprocess

    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    typeofunigene = None

    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list, fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()

        # replace fasta header for correct recognition
        with open(dnaseqfname) as fh:
            header, dnaseq, _descr = parseSingleFasta(fh.readlines())
        for i, exon in enumerate(unigeneexons):
            gff = list(exon)
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0:
                gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command; the gff tracks are fed on STDIN
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION, dnaseqfname)
        proc = subprocess.Popen(
            command,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            universal_newlines=True,
        )
        annotation_txt, _unused = proc.communicate(gffs2txt(unigeneexons))
        ugannotation = annotation_txt.strip().split("\t")
        typeofunigene = ugannotation[0]

        # abstract coordinates of start and stop codon from unigene
        # annotation; a missing or non-integer column means "not available"
        try:
            start_codon_pos = int(ugannotation[5])
        except (IndexError, ValueError, TypeError):
            start_codon_pos = None
        try:
            stop_codon_pos = int(ugannotation[6])
        except (IndexError, ValueError, TypeError):
            stop_codon_pos = None

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print(track)
            print("%s %s %s" % (ugannotation, start_codon_pos, stop_codon_pos))
            print("given ATG: %s" % (start_codon_gff,))
            print("given TGA: %s" % (stop_codon_gff,))
        ################################################################

        if start_codon_pos:
            # check if the obtained Methionine is the most likely TSS.
            # When a far better one is available -> shift the TSS
            # downstream (5p->3p) to this better TSS.
            startcodons = []
            for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3):
                if dnaseq[gffpos - 1 : gffpos - 1 + 3].upper() == "ATG":
                    tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
                    tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1]
                    tssSco = score_tss(dnaseq[tssSta:tssEnd])
                    startcodons.append((tssSco, gffpos))

            # check if there are >1 start codon possibilities
            if len(startcodons) > 1 and startcodons[0][0] < minimal_likely_tss_pssm_score:
                first_score = startcodons[0][0]
                for score, gffpos in startcodons[1:]:
                    # |score / first_score| > ratio, written as a product so
                    # a first score of zero (or integer scores) cannot break it
                    if (
                        score >= minimal_likely_tss_pssm_score
                        and abs(score) > shift_tss_pssm_score_ratio * abs(first_score)
                    ):
                        start_codon_pos = gffpos
                        # break out after first shift; this is now *THE* TSS
                        break
        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        # else: NO start_codon_pos available -> unigene fragment!

    elif start_codon_gff or stop_codon_gff:
        typeofunigene = None  # unknown -> no unigeneannotation
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])
    else:
        typeofunigene = None  # unknown -> no unigeneannotation
        ########################################################
        if verbose:
            print("NONE GIVEN seq/sta/end: %s" % (dnaseqfname,))
            print("gff ATG: %s" % (start_codon_gff,))
            print("gff TGA: %s" % (stop_codon_gff,))
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionality should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    def _new_track(fmethod, sta, end):
        # copy of the first unigene track with fmethod and coordinates replaced
        newgff = list(deepcopy(unigene_gff_list[0]))
        newgff[2] = fmethod
        newgff[3] = sta
        newgff[4] = end
        return tuple(newgff)

    # create an unigene stop codon track when in unigene_coordinate_set
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        return_unigene_gff_list.append(
            _new_track("UGstop", stop_codon_pos - 2, stop_codon_pos)
        )

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if (
        start_codon_pos is not None
        and start_codon_pos in unigene_coordinate_set
        and min(unigene_coordinate_set) < start_codon_pos
    ):
        if verbose:
            print("CREATE 5pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 5p unigene alignment part;
        # collect first, remove afterwards (no mutation while iterating)
        ignore_5p_coords = [c for c in unigene_coordinate_set if c < start_codon_pos]
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    ignore_3p_coords = []
    if (
        stop_codon_pos is not None
        and stop_codon_pos in unigene_coordinate_set
        and max(unigene_coordinate_set) > stop_codon_pos
    ):
        if verbose:
            print("CREATE 3pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 3p unigene alignment part
        ignore_3p_coords = [c for c in unigene_coordinate_set if c > stop_codon_pos]
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)

    has_utr_coords = bool(ignore_5p_coords or ignore_3p_coords)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    if unigene_coordinate_set and not has_utr_coords:
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)
    elif unigene_coordinate_set and has_utr_coords:
        # create new gff tracks for the remaining unigene exon coordinates;
        # the fmethod of the first unigene track is kept
        exon_fmethod = unigene_gff_list[0][2]
        for sta, end in _consecutive_coord_ranges(unigene_coordinate_set):
            return_unigene_gff_list.append(_new_track(exon_fmethod, sta, end))

        # make UTR5UGExon tracks if they exist; inserting the runs in
        # reversed order at the front keeps them in ascending order
        for sta, end in reversed(_consecutive_coord_ranges(ignore_5p_coords)):
            return_unigene_gff_list.insert(
                0, _new_track(GFF_UG5UTREXON_FMETHOD, sta, end)
            )

        # make UTR3UGExon tracks if they exist
        for sta, end in _consecutive_coord_ranges(ignore_3p_coords):
            return_unigene_gff_list.append(
                _new_track(GFF_UG3UTREXON_FMETHOD, sta, end)
            )
    # else: not really expected. There are UniGene tracks, but no UniGene
    # exons are recognized. Probably a wrong setting applied for
    # GFF_UGEXON_FMETHOD (not identical to the naming in the input gff).

    # order the unigene gff list (stop codon potentially on the front)
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)

    ################################################################
    if verbose and has_utr_coords:
        for track in return_unigene_gff_list:
            print(track)
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene
def _int_field_or_none(fields, index):
    """
    Return fields[index] converted to an integer, or None when unavailable

    @type  fields: list
    @param fields: list of strings (columns of the unigene annotation output)

    @type  index: integer
    @param index: position of the requested column

    @rtype:  integer or None
    @return: the integer value, or None when the column is absent or does
             not hold an integer
    """
    try:
        return int(fields[index])
    except (IndexError, ValueError):
        # column missing, or filled with a non-numeric placeholder
        return None


def _group_consecutive_coords(coords):
    """
    Group integer coordinates into runs of consecutive positions

    @type  coords: iterable
    @param coords: integer coordinates

    @rtype:  list
    @return: ascending list of (start, end) tuples, one tuple per run of
             consecutive coordinates (both ends inclusive)
    """
    runs = []
    for coord in sorted(coords):
        # the input is sorted, so the end of the last run is its maximum
        if runs and coord == runs[-1][1] + 1:
            runs[-1][1] = coord
        else:
            runs.append([coord, coord])
    return [(start, end) for start, end in runs]


def _clone_gff_track(template, fmethod, start, end):
    """
    Copy a gff tuple and overwrite its fmethod and its coordinates

    @type  template: tuple
    @param template: gff tuple that is used as backbone for the new track

    @type  fmethod: string
    @param fmethod: value for the fmethod column (index 2) of the new track

    @type  start: integer
    @param start: value for the start column (index 3) of the new track

    @type  end: integer
    @param end: value for the end column (index 4) of the new track

    @rtype:  tuple
    @return: new gff tuple; the template itself is left untouched
    """
    track = list(deepcopy(template))
    track[2] = fmethod
    track[3] = start
    track[4] = end
    return tuple(track)


def correct_unigene_for_utrs(unigene_gff_list, start_codon_gff=(), stop_codon_gff=(),
                             minimal_likely_tss_pssm_score=3.0,
                             shift_tss_pssm_score_ratio=4.0,
                             dnaseqfname=None, verbose=False):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    The start and stop codon positions are taken from the unigene annotation
    executable when a DNA sequence file is given, and otherwise from the given
    start/stop codon gff tuples. Unigene exon coordinates upstream of the
    start codon and downstream of the stop codon are split off into separate
    5'UTR / 3'UTR unigene exon tracks.

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon

    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  list of (gff) tuples + typeofunigene string
    @return: list with corrected gff tuples + string (first column of the
             unigene annotation output, or None when no annotation was run).
             When neither a sequence file nor a start/stop codon is given,
             the input list itself is returned unchanged.

    @attention: Global variable GFF_UGEXON_FMETHOD is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD is required for this function
    @attention: Global variables PYTHON_PATH and EXECUTABLE_UNIGENEANNOTATION
                are required when dnaseqfname is given
    @attention: Global variable IC_TSS_PATTERN_OFFSET is required when
                dnaseqfname is given
    """
    # NOTE: all print calls below take a single (pre-formatted) argument, so
    # they produce the same output as a Python 2 print statement and as the
    # Python 3 print function.

    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    typeofunigene = None

    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list,
                                           fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        # local import: replacement for the deprecated os.popen2
        import subprocess

        # obtain the unigene exons for the unigene structure annotation
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()

        # read the sequence; make sure the file handle is closed again
        fh = open(dnaseqfname)
        try:
            header, dnaseq, descr = parseSingleFasta(fh.readlines())
        finally:
            fh.close()

        # replace fasta header for correct recognition
        for i, exon in enumerate(unigeneexons):
            gff = list(exon)
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0:
                gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command; the exons are offered on STDIN.
        # communicate() writes the input, reads all output and waits for the
        # child process to finish.
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION,
                                dnaseqfname)
        process = subprocess.Popen(command, shell=True,
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        output = process.communicate(gffs2txt(unigeneexons))[0]
        ugannotation = output.strip().split("\t")
        typeofunigene = ugannotation[0]

        # abstract coordinates of start and stop codon
        # from unigene annotation
        start_codon_pos = _int_field_or_none(ugannotation, 5)
        stop_codon_pos = _int_field_or_none(ugannotation, 6)

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print(track)
            print("%s %s %s" % (ugannotation, start_codon_pos, stop_codon_pos))
            print("given ATG: %s" % (start_codon_gff,))
            print("given TGA: %s" % (stop_codon_gff,))
        ################################################################

        if start_codon_pos:
            # check if the PythonRegex obtained Methionine is the most
            # likely TSS. When a far better one is available -> shift
            # the TSS downstream (5p->3p) to this better TSS.
            # Only in-frame ATGs up to the end of the first unigene exon
            # are taken into account.
            startcodons = []
            for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3):
                if dnaseq[gffpos - 1:gffpos - 1 + 3].upper() == 'ATG':
                    # NOTE(review): tssSta may become negative for an ATG
                    # close to the sequence start, in which case the slice
                    # is not the intended window -- confirm score_tss copes.
                    tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
                    tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1]
                    tssSeq = dnaseq[tssSta:tssEnd]
                    tssSco = score_tss(tssSeq)
                    startcodons.append((tssSco, gffpos))

            # check if there are >1 start codon posibilities
            if len(startcodons) > 1 and\
                    startcodons[0][0] < minimal_likely_tss_pssm_score:
                first_score = startcodons[0][0]
                for score, gffpos in startcodons[1:]:
                    if score < minimal_likely_tss_pssm_score:
                        continue
                    # a first score of exactly zero makes the ratio
                    # infinite; treat that as 'ratio exceeded' instead of
                    # raising a ZeroDivisionError
                    if first_score == 0 or\
                            abs(score / first_score) > shift_tss_pssm_score_ratio:
                        start_codon_pos = gffpos
                        # break out after first shift; this is now *THE* TSS
                        break

        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        else:
            # NO start_codon_pos available -> unigene fragment!
            pass

    elif start_codon_gff or stop_codon_gff:
        typeofunigene = None  # unknown -> no unigeneannotation
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])

    else:
        typeofunigene = None  # unknown -> no unigeneannotation
        ########################################################
        if verbose:
            print("NONE GIVEN seq/sta/end: %s" % (dnaseqfname,))
            print("gff ATG: %s" % (start_codon_gff,))
            print("gff TGA: %s" % (stop_codon_gff,))
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionallity should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    # create an unigene stop codon track when in unigene_coordinate_set
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        return_unigene_gff_list.append(
            _clone_gff_track(unigene_gff_list[0], 'UGstop',
                             stop_codon_pos - 2, stop_codon_pos))

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if start_codon_pos is not None and\
            start_codon_pos in unigene_coordinate_set and\
            min(unigene_coordinate_set) < start_codon_pos:
        if verbose:
            print("CREATE 5pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 5p unigene alignment part
        ignore_5p_coords = [coord for coord in unigene_coordinate_set
                            if coord < start_codon_pos]
        # remove from the unigene coord set
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    ignore_3p_coords = []
    if stop_codon_pos is not None and\
            stop_codon_pos in unigene_coordinate_set and\
            max(unigene_coordinate_set) > stop_codon_pos:
        if verbose:
            print("CREATE 3pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 3p unigene alignment part
        ignore_3p_coords = [coord for coord in unigene_coordinate_set
                            if coord > stop_codon_pos]
        # remove from the unigene coord set; the stop codon positions
        # themselves remain part of the unigene exon coordinates
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    has_utrs = bool(ignore_5p_coords or ignore_3p_coords)
    if unigene_coordinate_set and not has_utrs:
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)
    elif unigene_coordinate_set:
        # the first input track serves as backbone for all new tracks
        template = unigene_gff_list[0]

        # create new gff tracks for unigene exons (fmethod of the backbone)
        for start, end in _group_consecutive_coords(unigene_coordinate_set):
            return_unigene_gff_list.append(
                _clone_gff_track(template, template[2], start, end))

        # make UTR5UGExon track(s) if they exist; place them, in ascending
        # order, in front of all tracks gathered so far
        return_unigene_gff_list[0:0] = [
            _clone_gff_track(template, GFF_UG5UTREXON_FMETHOD, start, end)
            for start, end in _group_consecutive_coords(ignore_5p_coords)]

        # make UTR3UGExon track(s) if they exist; append at the end
        return_unigene_gff_list.extend([
            _clone_gff_track(template, GFF_UG3UTREXON_FMETHOD, start, end)
            for start, end in _group_consecutive_coords(ignore_3p_coords)])
    else:
        # hmm... not really expected. There are UniGene tracks,
        # but no UniGene exons are recognized. Probably a wrong setting
        # applied for GFF_UGEXON_FMETHOD (not identical to the naming in
        # the input gff).
        pass

    # order the unigene gff list (stop codon potentially on the front)
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)

    ################################################################
    if verbose and has_utrs:
        for track in return_unigene_gff_list:
            print(track)
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene