Esempio n. 1
0
def confirmcanonicalsplicesites(sequence,gfflist,exon_fmethod=None,verbose=False):
    """
    Check the splice sites of every intron between consecutive exon tracks

    @type  sequence: string
    @param sequence: DNA sequence from which the splice site dinucleotides are sliced

    @type  gfflist: list
    @param gfflist: list with gff tuples; filtered with filtergffs4fmethod()

    @type  exon_fmethod: string (or None)
    @param exon_fmethod: fmethod passed to filtergffs4fmethod() to select the exons

    @type  verbose: Boolean
    @param verbose: print each created warning to STDOUT (True) or not (False, default)

    @rtype:  tuple
    @return: tuple of ( onlycanonical Boolean, list with Warning objects )

    @attention: requires global variables CANONICAL_DONOR_SITES,
                NON_CANONICAL_DONOR_SITES and CANONICAL_ACCEPTOR_SITES
    """
    # sequence has original orientation, but exon coords may have been transferred to other strand
    exons = filtergffs4fmethod(gfflist,exon_fmethod)
    warnings = []
    onlycanonical = True

    # walk over the introns: each pair of (upstream exon, downstream exon)
    for upstream, downstream in zip(exons[:-1], exons[1:]):
        donorpos = upstream[4]
        acceppos = downstream[3]
        # 2 nt directly after the upstream exon end (gff end used as python offset)
        dSeq = sequence[donorpos:donorpos+2].upper()
        # 2 nt directly in front of the downstream exon start
        aSeq = sequence[acceppos-3:acceppos-1].upper()

        if dSeq not in CANONICAL_DONOR_SITES:
            onlycanonical = False
            # known non-canonical donors get a milder warning than unknown ones
            if dSeq in NON_CANONICAL_DONOR_SITES:
                warn = NonCanonicalSpliceSiteWarning("Donor:"+dSeq)
            else:
                warn = ImplausibleSpliceSiteWarning("Donor:"+dSeq)
            warnings.append( warn )
            if verbose: print(warn)

        if aSeq not in CANONICAL_ACCEPTOR_SITES:
            # there is no non-canonical acceptor category: always Implausible
            warn = ImplausibleSpliceSiteWarning("Acceptor:"+aSeq)
            warnings.append( warn )
            onlycanonical = False
            if verbose: print(warn)

    # return the status of the observed splice sites
    return onlycanonical, warnings
Esempio n. 2
0
def create_gene_utrs(gene_gff_list):
    """
    Derive UTR tracks from the exon and CDS tracks of an (annotated) gene

    Exon coordinates in front of the lowest CDS coordinate become 5'UTR,
    exon coordinates behind the highest CDS coordinate become 3'UTR. No
    3'UTR is made when the exons end exactly 3 nt after the CDS (this is
    only the stop codon).

    @type  gene_gff_list: list
    @param gene_gff_list: list with gff tuples of the (annotated) gene

    @rtype  utrs: list
    @return utrs: list with gff tuples of the UTR tracks (can be empty)

    @attention: requires global variables GFF_CDS_FMETHOD, GFF_EXON_FMETHOD
    @attention: requires global variables GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD
    @attention: requires global variables GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD
    """
    # coordinate collections of the CDS and the exon tracks
    cdscoords  = gffs2coordset(gene_gff_list,fmethod=[GFF_CDS_FMETHOD])
    exoncoords = gffs2coordset(gene_gff_list,fmethod=[GFF_EXON_FMETHOD])

    # both track types are needed to tell UTR from coding sequence
    if not (cdscoords and exoncoords):
        return []

    cds_min,  cds_max  = min(cdscoords),  max(cdscoords)
    exon_min, exon_max = min(exoncoords), max(exoncoords)

    # 5'UTR: exon positions upstream of the start codon
    utr5p_coords = [ pos for pos in range(exon_min, cds_min) if pos in exoncoords ]

    # 3'UTR: exon positions downstream of the CDS, unless the exon
    # end is exactly cds-end+3 -> only a stop codon
    utr3p_coords = []
    if cds_max+3 != exon_max:
        utr3p_coords = [ pos for pos in range(cds_max+1, exon_max+1) if pos in exoncoords ]

    if not (utr5p_coords or utr3p_coords):
        return []

    # the first coding exon serves as backbone for all UTR tracks
    cexons = filtergffs4fmethod(gene_gff_list,fmethod=GFF_CDS_FMETHOD)

    utrs = []
    utrsettings = (
        ( utr5p_coords, GFF_UTR5_FSOURCE, GFF_UTR5_FMETHOD ),
        ( utr3p_coords, GFF_UTR3_FSOURCE, GFF_UTR3_FMETHOD ),
        )
    for coords, fsource, fmethod in utrsettings:
        backbone    = list( cexons[0] )
        backbone[1] = fsource
        backbone[2] = fmethod
        utrs.extend( coordset2gfftracks(coords,backbone) )

    # return the utr tracks
    return utrs
Esempio n. 3
0
def annotatedgeneexonsizeevaluation(input,small_exon_nt_threshold=25,small_intron_nt_threshold=42):
    """
    Report small annotated coding exons and small annotated introns

    For every organism in input, the CDS tracks of 'gff-gene' and of
    'gff-unigene' are inspected. Warnings are appended to
    input[org]['warnings'].

    @type  input: dict
    @param input: input dictionary data structure

    @type  small_exon_nt_threshold: integer
    @param small_exon_nt_threshold: coding exons of at most this nt length are reported

    @type  small_intron_nt_threshold: integer
    @param small_intron_nt_threshold: introns shorter than this nt length are reported

    @rtype:  Boolean
    @return: are small coding exons present in the available annotations?

    @attention: requires global variable GFF_CDS_FMETHOD
    """
    # ( key of gff tracks, key of orf ids, description used in the message )
    evidencekeys = (
            ('gff-gene', 'orfid-genestructure', 'annotated gene'),
            ('gff-unigene', 'orfid-unigenestructure', 'unigene'),
            )
    msgformat = "organism: '%s' %s (gff) start: %s end: %s length: %s"
    tiny_exon_seen = False

    for org in input.keys():
        orgdata = input[org]
        for (gffkey,orfidkey,typeofevidence) in evidencekeys:
            if not orgdata[gffkey]:
                # no tracks of this type of evidence
                continue
            codingexons = filtergffs4fmethod(orgdata[gffkey],GFF_CDS_FMETHOD)
            finalpos = len(codingexons)-1

            # find small annotated coding exons
            for pos, gfftrack in enumerate(codingexons):
                # python (0-based, end-exclusive) coordinates of the exon
                start = int(gfftrack[3])-1
                end   = int(gfftrack[4])
                length = end - start
                if length > small_exon_nt_threshold:
                    continue
                tiny_exon_seen = True

                # a small exon! create a warning message
                message = msgformat % ( org, typeofevidence, (start+1), end, length )

                # try to find the orf on which this exon is located
                orfids = orgdata[orfidkey]
                if orfids:
                    for orfid in orfids:
                        orf = orgdata['orfs'].get_orf_by_id(orfid)
                        if orf.start <= start+1 and orf.end >= end:
                            message+=" on %s" % (str(orf))
                            break

                # append the Warning
                orgdata['warnings'].append( SmallAnnotatedExonWarning(message) )

                # additional warning for a small First or Final exon
                if pos == 0:
                    orgdata['warnings'].append( SmallAnnotatedFirstExonWarning(message) )
                elif pos == finalpos:
                    orgdata['warnings'].append( SmallAnnotatedFinalExonWarning(message) )

            # find small annotated introns between adjacent coding exons
            for prevexon, nextexon in zip(codingexons, codingexons[1:]):
                sta = int(prevexon[4])
                end = int(nextexon[3])-1
                if end-sta < small_intron_nt_threshold:
                    # a small intron! create a warning message and append it
                    message = msgformat % ( org, typeofevidence, (sta+1), end, (end - sta) )
                    orgdata['warnings'].append( SmallAnnotatedIntronWarning(message) )

    # return the Boolean status
    return tiny_exon_seen
Esempio n. 4
0
def geneconfirmation(input,verbose=False):
    """
    Confirm given gene GFF structure with Orfs of the DNA sequence

    The input structure is first passed through readsequences(),
    rungetorf() and parseinputgff(). Then, for every organism key:
      (A)   the CDS tracks of input[org]['gff-gene'] are mapped onto Orfs
            with genestructuregff2orfs(). When there are exon tracks but
            no CDS tracks, the exon tracks are copied as CDS tracks if
            their frame-0 translation contains no '*'.
      (B,C) the start and the stop codon of the CDS tracks are checked
      (D,E) a start / stop codon track is appended to
            input[org]['gff-gene'] when none is annotated
      (F)   the first and final annotated Orf objects are labelled

    @type  input: dict
    @param input: input dictionary data structure

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  tuple 
    @return: tuple of ( input data structure, TRACKS_ARE_PROPERLY_MATCHED boolean )

    @attention: input[org]['gff-gene'] and input[org]['warnings'] are changed in place
    @attention: input[org]['orfid-genestructure'] is (re)set for every organism
    @attention: some diagnostics are printed regardless of the verbose setting
    """
    input = readsequences(input)
    input = rungetorf(input)
    input = parseinputgff(input)

    # overall status; set to False as soon as a single organism fails
    GENE_TRACKS_ARE_PROPERLY_MATCHED = True
    for org in input.keys():

        # (A) find the genestructure_orfmodel of the annotated/known gene structure
        input[org]['orfid-genestructure'] = []
        # per-organism status; only becomes True via genestructuregff2orfs()
        _GENE_TRACKS_ARE_PROPERLY_MATCHED = False
        if input[org]['gff-gene']:
            # get only the CDS-type of tracks that define the coding sequence
            genecdstracks = filtergffs4fmethod( input[org]['gff-gene'], GFF_CDS_FMETHOD ) 

            # for comparison / later use get exon tracks too
            geneexontracks = filtergffs4fmethod( input[org]['gff-gene'], GFF_EXON_FMETHOD )

            if genecdstracks:
                # as expected...
                pass
            elif not genecdstracks and geneexontracks:
                # MANY GeneLoci miss CDS tracks but have exon tracks
                # In the BROAD annotation logic this represents partial genes:
                # based on similarity, but missing start and/or stop codon
                # In practice, these instances are likely the result of
                # sequence errors!
                # But, back to the problem we face here: what to do with these cases?
                # The choice that was made here is as follows:
                # - check the exon tracks as if they represent unigenes
                # - if OK -> convert the exon tracks to CDS tracks
                # - the GENE_TRACKS_ARE_PROPERLY_MATCHED will later in this
                #   function be recognized as False 

                # Append the Warning message
                message = "no tracks of type '%s'" % (GFF_CDS_FMETHOD)
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )

                # obtain exon track's DNA sequence: the gff2fasta executable is
                # fed the exon tracks on STDIN; fasta header lines and
                # newlines are stripped from its output by grep / tr
                command = """%s %s %s | grep -v ">" | tr -d "\\n" """ % ( PYTHON_PATH,
                                         EXECUTABLE_GFF2FASTA,
                                         input[org]['genomeseqfile'] )
                # ci,co,ce are used as STDIN, STDOUT and STDERR of the command
                ci,co,ce = os.popen3(command)
                # make sure geneexontracks FREF matches input[org]['genomeseqfile'] FREF
                # (first word of the first line of the file, without the '>')
                # NOTE(review): this file handle is never explicitly closed
                loci_dna_fref = open(input[org]['genomeseqfile']).readlines()[0].split(' ')[0].replace(">","") 
                ci.write( gffs2txt(geneexontracks).replace( geneexontracks[0][0],loci_dna_fref ) )
                ci.close()
                # translate the concatenated exon DNA sequence (second argument 0)
                protseq = dna2protein( co.read().strip(), 0 )
                co.close()
                error = ce.read()
                ce.close()

                if error.find("unrecognized header:") > -1:
                    # rewrite GFF fref to genomic fref
                    # NOTE(review): nothing is rewritten here; this branch only
                    # prints debugging information (always, not only if verbose)
                    print "XXXX",error
                    print command
                    print gffs2txt(geneexontracks)
                    print input[org]['genomefref'],input[org]['locusfref'],input[org]['proteinfref']
                    print open( input[org]['genomeseqfile']).readlines()[0]

                if protseq.count('*') == 0:
                    # The EXON type tracks have an ORF in frame 0
                    # This represents a partial gene structure.
                    # Copy the EXON tracks as CDS tracks and run
                    # the rest of this function.
                    for _track in geneexontracks:
                        track = list(_track)
                        track[2] = GFF_CDS_FMETHOD
                        track = tuple(track)
                        input[org]['gff-gene'].append( track )
                        genecdstracks.append( track ) 
                else:
                    message = "tracks of type '%s' have no ORF" % (GFF_EXON_FMETHOD)
                    warn = IncompleteGeneStructureWarning(message)
                    input[org]['warnings'].append( warn )
                    GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                    # nothing more can be checked: proceed with the next organism
                    continue

                ################################################################
                if verbose:
                    print org, "NO CDS TRACKS!!", input[org]['FREF']
                    print protseq[0:20]+"..."+protseq[-20:], len(protseq)
                    for track in genecdstracks: print track[0:7]
                ################################################################

            else:
                # neither types of tracks -> no gene annotation at all! 
                GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                continue

            # correct the final exon for absence of STOP-codon
            # many given gene annotations do not include STOP-codon in final exon, some do
            # By definition:
            # A track of type CDS  should EXCLUDE the stop codon triplet itself
            # A track of type exon should INCLUDE the stop codon triplet itself
            # Not all annotations follow this principle, however...
            # And, in ABFGP, it was chosen to use ANNOTATED CDS tracks as input and
            # PREDICTED EXON tracks as output.
            # This was done to have the stop-codon included as a check-final-check for
            # the final exon (if the predicted CDS is indeed followed by a stop codon). 

            if not genestructurehasproperstopcodon(genecdstracks,
            input[org]['genomeseq'],GFF_CDS_FMETHOD,verbose=False):
                # Well, that is as expected -> we expect here CDS tracks
                # check if the NEXT triplet is a STOP-codon
                try:
                    triplet = input[org]['genomeseq'][ genecdstracks[-1][4] : genecdstracks[-1][4]+3 ].lower()
                except:
                    # here errors have occurred a lot! Some annotations mix up exon/CDS
                    # tracks (and in some cases I mixed them up during the conversion
                    # process from annotation to GeneLoci). Print this exception ALWAYS!
                    print "Exception occurred:"
                    print org, "genecdstracks:", len(genecdstracks)
                    # the failing statement is repeated after printing, so the
                    # error presumably is raised again and still propagates
                    triplet = input[org]['genomeseq'][ genecdstracks[-1][4] : genecdstracks[-1][4]+3 ].lower()

                if triplet in ['tga','tag','taa']:
                    # update track by +3 length for STOP-codon
                    # this update is only valid within the scope of this function;
                    # input[org]['gff-gene'] remains unchanged.
                    track = list( genecdstracks[-1] )
                    track[4]+=3
                    genecdstracks[-1] = tuple(track)
                else:
                    # no CDS type of track followed by a stop-codon;
                    # ignore here, error message will follow later
                    pass

            elif genecdstracks and geneexontracks and\
            genecdstracks[-1][3] == geneexontracks[-1][3] and\
            genecdstracks[-1][4]+3 == geneexontracks[-1][4]:
                # final CDS track ends 3 nt before the final exon track
                # do not correct anything; it's fine!
                pass

            else:
                # hmmm.... that is weird! We expect here CDS tracks, not EXON tracks
                # do a HARD correction on the provided annotation track!
                # TODO: this piece of code will fail for spliced stop codons:
                # Tgt.....agGA because it is spread over >1 track            
                for pos in range(0,len(input[org]['gff-gene'])):
                    track = input[org]['gff-gene'][pos]
                    if track == genecdstracks[-1]:
                        # HARD-replace in the input[org]['gff-gene'] list with gff tuples!
                        # NOTE(review): genecdstracks itself is not updated
                        # here -- confirm that this is intended
                        track = list(track)
                        track[4] = track[4]-3
                        input[org]['gff-gene'][pos] = tuple(track)
                        break
                else:
                    # for-else: reached only when no track equal to the final
                    # CDS track was found (loop ended without break)
                    print "GeneStructureWarning: VERY UNUSUAL SPLITTED STOP CODON", org

            # convert exon/CDS tracks to UTR tracks
            utrs = create_gene_utrs(input[org]['gff-gene'])
            input[org]['gff-gene'].extend(utrs)

            # get the orfs of the genestructure
            orfids,_GENE_TRACKS_ARE_PROPERLY_MATCHED = genestructuregff2orfs(genecdstracks,input[org]['orfs'],verbose=False)
            if (not _GENE_TRACKS_ARE_PROPERLY_MATCHED or not orfids):
                print "_GENE_TRACKS_ARE_PROPERLY_MATCHED == False", org, orfids
                # redo just for logging purposes!
                orfids,_GENE_TRACKS_ARE_PROPERLY_MATCHED = genestructuregff2orfs(genecdstracks,input[org]['orfs'],verbose=True)

            # place the list with orfids in 'orfid-genestructure' dict key 
            input[org]['orfid-genestructure'] = orfids
            # check if the gene was properly matched.
            if not _GENE_TRACKS_ARE_PROPERLY_MATCHED or not input[org]['orfid-genestructure']:
                GENE_TRACKS_ARE_PROPERLY_MATCHED = False
                # append warning to warnings list
                warn = GeneStructureIsNotMappableOnOrfsWarning("not mappable on orfs")
                input[org]['warnings'].append( warn )
                # set known gene-tracks to empty list!
                input[org]['orfid-genestructure'] = []
                # check for a potential sequence error in the sequence
                # (the return value, if any, is not used)
                potentialsequenceerror(genecdstracks,input[org]['orfs'],verbose=verbose)
            else:
                # all fine. Label all the **known** Orfs with IS_ANNOTATED_EXON_LABEL
                for orfid in input[org]['orfid-genestructure']:
                    orfObj = input[org]['orfs'].get_orf_by_id(orfid)
                    # add label to Orf object
                    setattr(orfObj,IS_ANNOTATED_EXON_LABEL,True)


        # vars for annotated start & stop codon check
        GENE_HAS_PROPER_START_CODON = False
        GENE_HAS_PROPER_STOP_CODON  = False

        # (B) check start codon of the gene structure
        # (genecdstracks only exists when input[org]['gff-gene'] was not empty;
        #  the condition below guards against using it otherwise)
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and input[org]['gff-gene']:
            GENE_HAS_PROPER_START_CODON = genestructurehasproperstartcodon(
                    genecdstracks,
                    input[org]['genomeseq'],
                    GFF_CDS_FMETHOD,
                    verbose=verbose)
            # create warning message if a problem occurred 
            if not GENE_HAS_PROPER_START_CODON:
                # create warning message
                try:
                    # python slice coordinates of the first triplet of the first CDS track
                    sta,end  = genecdstracks[0][3]-1, genecdstracks[0][3]+2
                    startdna = input[org]['genomeseq'][sta:end]
                    orf      = input[org]['orfs'].get_orf_by_id(
                                   input[org]['orfid-genestructure'][0] )
                    message  = "no proper start codon (%s [%s] %s) on orf %s" % (
                        startdna, dna2protein(startdna,0),range(sta,end), orf)
                except:
                    # coordinates are horribly wrong -> make simple warning message
                    message = "no proper start codon (Exception)"
                # append warning to warnings list
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )


        # (C) check stop codon of the gene structure
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and input[org]['gff-gene']:
            GENE_HAS_PROPER_STOP_CODON = genestructurehasproperstopcodon(
                    genecdstracks,
                    input[org]['genomeseq'],
                    GFF_CDS_FMETHOD,
                    verbose=verbose)
            if not GENE_HAS_PROPER_STOP_CODON:
                # create warning message
                try:
                    # python slice coordinates of the final triplet of the final CDS track
                    sta,end = genecdstracks[-1][4]-3, genecdstracks[-1][4]
                    stopdna = input[org]['genomeseq'][sta:end]
                    message = "no proper stop codon (%s [%s] %s) on orf %s" % (
                        stopdna, dna2protein(stopdna,0),range(sta,end),
                        input[org]['orfs'].get_orf_by_id(input[org]['orfid-genestructure'][-1])
                        )
                except:
                    # coordinates are horribly wrong -> make simple warning message
                    message = "no proper stop codon (Exception)"
                # append warning to warnings list
                warn = IncompleteGeneStructureWarning(message)
                input[org]['warnings'].append( warn )

        # (D) check if start_codon is present as annotated track
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and GENE_HAS_PROPER_START_CODON and\
        input[org]['gff-gene'] and not filtergffs4fmethod( input[org]['gff-gene'],
        GFF_GENESTART_FMETHOD ):
            # create start_codon track: a copy of the first CDS track,
            # shortened to its first 3 nt; columns 5 and 7 are set to "."
            # and only the first "; "-separated part of column 8 is kept
            # TODO: write a generic function that does the job,
            # TODO: which takes spliced start codons into account
            startcodon    = list(genecdstracks[0])
            startcodon[2] = GFF_GENESTART_FMETHOD
            startcodon[4] = startcodon[3]+2
            startcodon[5] = "."
            startcodon[7] = "."
            startcodon[8] = startcodon[8].split("; ")[0]
            input[org]['gff-gene'].append( tuple(startcodon) )
            ################################################################
            if verbose:
                sta,end  = genecdstracks[0][3]-1, genecdstracks[0][3]+2
                startdna = input[org]['genomeseq'][sta:end]
                print "CREATED:", genecdstracks[0][0:7]
                print "CREATED:", tuple(startcodon)
                print "CREATED:", sta, end, startdna, dna2protein(startdna,0)
            ################################################################

        # (E) check if stop_codon is present as annotated track
        if _GENE_TRACKS_ARE_PROPERLY_MATCHED and GENE_HAS_PROPER_STOP_CODON and\
        input[org]['gff-gene'] and not filtergffs4fmethod( input[org]['gff-gene'],
        GFF_GENESTOP_FMETHOD ):
            # create stop_codon track: a copy of the final CDS track,
            # shortened to its final 3 nt (end coordinate is left as it is)
            stopcodon    = list(genecdstracks[-1])
            stopcodon[2] = GFF_GENESTOP_FMETHOD
            stopcodon[4] = stopcodon[4]
            stopcodon[3] = stopcodon[4]-2 
            stopcodon[5] = "."
            stopcodon[7] = "."
            stopcodon[8] = stopcodon[8].split("; ")[0]
            input[org]['gff-gene'].append( tuple(stopcodon) )
            ################################################################
            if verbose:
                sta,end = genecdstracks[-1][4]-3, genecdstracks[-1][4]
                stopdna = input[org]['genomeseq'][sta:end]
                print "CREATED:", genecdstracks[-1][0:7]
                print "CREATED:", tuple(stopcodon)
                print "CREATED:", sta, end, stopdna, dna2protein(stopdna,0)
            ################################################################



        # (F) label the first & final (annotated) Orfs in the gene structure
        if GENE_HAS_PROPER_START_CODON and input[org]['orfid-genestructure']:
            firstorfid = input[org]['orfid-genestructure'][0]
            firstorf = input[org]['orfs'].get_orf_by_id(firstorfid)
            # add label to Orf object
            setattr(firstorf,FIRST_ANNOTATED_EXON_LABEL,True)
        if GENE_HAS_PROPER_STOP_CODON and input[org]['orfid-genestructure']:
            finalorfid = input[org]['orfid-genestructure'][-1]
            finalorf = input[org]['orfs'].get_orf_by_id(finalorfid)
            # add label to Orf object
            setattr(finalorf,FINAL_ANNOTATED_EXON_LABEL,True)


        # check if START & STOP codon are properly matched
        if not GENE_HAS_PROPER_START_CODON or not GENE_HAS_PROPER_STOP_CODON:
            _GENE_TRACKS_ARE_PROPERLY_MATCHED = False
            GENE_TRACKS_ARE_PROPERLY_MATCHED  = False


        # print the negative outcome in verbose mode
        # NOTE(review): the reset of 'orfid-genestructure' below is only done
        # when verbose is True -- confirm whether it should be unconditional
        if not _GENE_TRACKS_ARE_PROPERLY_MATCHED and verbose:
            # set known gene-tracks to empty list!
            input[org]['orfid-genestructure'] = []
            print "# WARNING: Gene is not mappable on Orfs:", org

    # return the incremented input dict and error status (True or False)
    return input, GENE_TRACKS_ARE_PROPERLY_MATCHED
def correct_unigene_for_utrs(
    unigene_gff_list,
    start_codon_gff=(),
    stop_codon_gff=(),
    minimal_likely_tss_pssm_score=3.0,
    shift_tss_pssm_score_ratio=4.0,
    dnaseqfname=None,
    verbose=False,
):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon
    
    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  list of (gff) tuples + typeofunigene string
    @return: list with corrected gff tuples + string

    @attention: Global variable GFF_UGEXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD  is required for this function
    """
    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    typeofunigene = None
    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list, fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        # print unigene structure annotation
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()
        # replace fasta header for correct recognition
        # header,descr = parseSingleFastaHeaderFromFile(dnaseqfname)
        header, dnaseq, descr = parseSingleFasta(open(dnaseqfname).readlines())
        for i in range(0, len(unigeneexons)):
            gff = list(unigeneexons[i])
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0:
                gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION, dnaseqfname)
        ci, co = os.popen2(command)
        ci.write(gffs2txt(unigeneexons))
        ci.close()
        ugannotation = co.read().strip().split("\t")
        co.close()
        typeofunigene = ugannotation[0]
        # abstract coordinates of start and stop codon
        # from unigene annotation
        try:
            start_codon_pos = int(ugannotation[5])
        except:
            start_codon_pos = None
        try:
            stop_codon_pos = int(ugannotation[6])
        except:
            stop_codon_pos = None

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print track
            print ugannotation, start_codon_pos, stop_codon_pos
            print "given ATG:", start_codon_gff
            print "given TGA:", stop_codon_gff
        ################################################################

        if start_codon_pos:
            # check if the PythonRegex obtained Methionine is the most
            # likely TSS. When a far better one is available -> shift
            # the TSS downstream (5p->3p) to this better TSS.
            startcodons = []
            for gffpos in range(start_codon_pos, int(unigeneexons[0][4]), 3):
                if dnaseq[gffpos - 1 : gffpos - 1 + 3].upper() == "ATG":
                    tssSta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
                    tssEnd = gffpos - 1 + 3 + IC_TSS_PATTERN_OFFSET[1]
                    tssSeq = dnaseq[tssSta:tssEnd]
                    tssSco = score_tss(tssSeq)
                    # print 'ATG', gffpos, "%1.2f" % tssSco
                    startcodons.append((tssSco, gffpos))
            # check if there are >1 start codon posibilities
            if len(startcodons) > 1 and startcodons[0][0] < minimal_likely_tss_pssm_score:
                for score, gffpos in startcodons[1:]:
                    if (
                        score >= minimal_likely_tss_pssm_score
                        and abs(score / startcodons[0][0]) > shift_tss_pssm_score_ratio
                    ):
                        start_codon_pos = gffpos
                        # print "TSS pos SHIFTED", startcodons[0][1], "->", gffpos
                        # break out after first shift; this is now *THE* TSS
                        break
        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        else:
            # NO start_codon_pos available -> unigene fragment!
            pass

    elif start_codon_gff or stop_codon_gff:
        typeofunigene = None  # unknown -> no unigeneannotation
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])
    else:
        typeofunigene = None  # unknown -> no unigeneannotation
        ########################################################
        if verbose:
            print "NONE GIVEN seq/sta/end:", dnaseqfname
            print "gff ATG:", start_codon_gff
            print "gff TGA:", stop_codon_gff
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionallity should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    # create an unigene stop codon track when in unigene_coordinate_set
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        # make a deepcopy of the first unigene exon track and make a list of it
        newgff = list(deepcopy(unigene_gff_list[0]))
        # update the coordinates
        newgff[2] = "UGstop"
        newgff[3] = stop_codon_pos - 2
        newgff[4] = stop_codon_pos
        return_unigene_gff_list.append(tuple(newgff))

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if (
        start_codon_pos != None
        and start_codon_pos in unigene_coordinate_set
        and min(unigene_coordinate_set) < start_codon_pos
    ):
        if verbose:
            print "CREATE 5pUTR for ug:", typeofunigene
        # yes, there is a 5p unigene alignment part
        for coord in unigene_coordinate_set:
            if coord < start_codon_pos:
                # append to the ignore_5p_coords list
                ignore_5p_coords.append(coord)
        # remove from the unigene coord set
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    ignore_3p_coords = []
    if (
        stop_codon_pos != None
        and stop_codon_pos in unigene_coordinate_set
        and max(unigene_coordinate_set) > stop_codon_pos
    ):
        if verbose:
            print "CREATE 3pUTR for ug:", typeofunigene
        # yes, there is a 3p unigene alignment part
        for coord in unigene_coordinate_set:
            if coord > stop_codon_pos:
                # append to the ignore_5p_coords list
                ignore_3p_coords.append(coord)
        # remove from the unigene coord set
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)
        #### remove the stop codon position too
        ###unigene_coordinate_set.remove(stop_codon_pos-2)
        ###unigene_coordinate_set.remove(stop_codon_pos-1)
        ###unigene_coordinate_set.remove(stop_codon_pos)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    if not (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set:
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)

    elif (ignore_5p_coords or ignore_3p_coords) and unigene_coordinate_set:
        # create new gff tracks for unigene exons
        unigene_exon_coords = list(unigene_coordinate_set)
        unigene_exon_coords.sort()
        track_coords = [[unigene_exon_coords[0]]]
        for coord in unigene_exon_coords[1:]:
            if coord == max(track_coords[-1]) + 1:
                track_coords[-1].append(coord)
            else:
                track_coords.append([coord])
        for track in track_coords:
            # make a deepcopy of the first unigene exon track and make a list of it
            newgff = list(deepcopy(unigene_gff_list[0]))
            # update the coordinates
            newgff[3] = min(track)
            newgff[4] = max(track)
            # and append to the new return unigene gff list
            return_unigene_gff_list.append(tuple(newgff))

        # make UTR5UGExon track if it exists
        if ignore_5p_coords:
            ignore_5p_coords.sort()
            tracks = [[ignore_5p_coords[0]]]
            for coord in ignore_5p_coords[1:]:
                if coord == max(tracks[-1]) + 1:
                    tracks[-1].append(coord)
                else:
                    tracks.append([coord])
            # reverse tracks; if there are >1, inserting in the
            # return list will guarantee the correct order
            tracks.reverse()
            for track in tracks:
                # make a deepcopy of the first unigene exon track and make a list of it
                newgff = list(deepcopy(unigene_gff_list[0]))
                # update the coordinates
                newgff[2] = GFF_UG5UTREXON_FMETHOD
                newgff[3] = min(track)
                newgff[4] = max(track)
                # and insert as the first the new return unigene gff list
                return_unigene_gff_list.insert(0, tuple(newgff))

        # make UTR3UGExon track
        if ignore_3p_coords:
            ignore_3p_coords.sort()
            tracks = [[ignore_3p_coords[0]]]
            for coord in ignore_3p_coords[1:]:
                if coord == max(tracks[-1]) + 1:
                    tracks[-1].append(coord)
                else:
                    tracks.append([coord])
            for track in tracks:
                # make a deepcopy of the first unigene exon track and make a list of it
                newgff = list(deepcopy(unigene_gff_list[0]))
                # update the coordinates
                newgff[2] = GFF_UG3UTREXON_FMETHOD
                newgff[3] = min(track)
                newgff[4] = max(track)
                # and append to the new return unigene gff list
                return_unigene_gff_list.append(tuple(newgff))

    else:
        # hmm... not really expected. There are UniGene tracks,
        # but no UniGene exons are recognized. Probably a wrong setting
        # applied for GFF_UGEXON_FMETHOD (not identical to the naming in
        # the input gff.
        pass

    # order the unigene gff list (stop codon potentially on the front
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)
    ################################################################
    if verbose and (ignore_5p_coords or ignore_3p_coords):
        for track in return_unigene_gff_list:
            print track
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene
Esempio n. 6
0
def _group_consecutive_coords(coords):
    """
    Split coordinates into runs of directly adjacent positions

    @type  coords: iterable
    @param coords: unique integer coordinates, in any order

    @rtype:  list
    @return: list of lists; each inner list is an ascending run of
             consecutive coordinates, runs are ordered ascending too
    """
    runs = []
    for coord in sorted(coords):
        if runs and coord == runs[-1][-1] + 1:
            runs[-1].append(coord)
        else:
            runs.append([coord])
    return runs


def _coords2gff_tracks(template_gff, coords, fmethod):
    """
    Create one gff tuple per run of consecutive coordinates

    @type  template_gff: tuple
    @param template_gff: gff tuple that is copied for every new track

    @type  coords: iterable
    @param coords: unique integer coordinates to be covered by the tracks

    @type  fmethod: string
    @param fmethod: value stored in column 2 of every new track

    @rtype:  list
    @return: list of gff tuples, ordered on ascending start coordinate
    """
    tracks = []
    for run in _group_consecutive_coords(coords):
        newgff = list(deepcopy(template_gff))
        newgff[2] = fmethod
        newgff[3] = run[0]
        newgff[4] = run[-1]
        tracks.append(tuple(newgff))
    return tracks


def _shift_to_likely_tss(dnaseq, start_codon_pos, first_exon_end,
                         minimal_likely_tss_pssm_score,
                         shift_tss_pssm_score_ratio):
    """
    Return a downstream in-frame ATG when it is a far better TSS

    In-frame ATGs are collected from start_codon_pos up to (excluding)
    first_exon_end and scored with score_tss(). Only when the first ATG
    scores below minimal_likely_tss_pssm_score, the first later ATG that
    reaches this minimal score AND exceeds shift_tss_pssm_score_ratio
    relative to the first ATG replaces it.

    @rtype:  integer
    @return: the (possibly shifted) gff coordinate of the start codon

    @attention: requires global variable IC_TSS_PATTERN_OFFSET
    @attention: requires function score_tss
    """
    startcodons = []
    for gffpos in range(start_codon_pos, first_exon_end, 3):
        if dnaseq[gffpos - 1:gffpos + 2].upper() == 'ATG':
            tss_sta = gffpos - 1 - IC_TSS_PATTERN_OFFSET[0]
            tss_end = gffpos + 2 + IC_TSS_PATTERN_OFFSET[1]
            startcodons.append((score_tss(dnaseq[tss_sta:tss_end]), gffpos))

    # a shift is only considered when there is an alternative ATG and
    # the first one is an unlikely TSS
    if len(startcodons) < 2:
        return start_codon_pos
    first_score = startcodons[0][0]
    if not first_score < minimal_likely_tss_pssm_score:
        return start_codon_pos

    for score, gffpos in startcodons[1:]:
        if score < minimal_likely_tss_pssm_score:
            continue
        # a first score of exactly zero makes the ratio infinite; treat
        # that as exceeded instead of raising a ZeroDivisionError
        if first_score == 0 or\
        abs(score / first_score) > shift_tss_pssm_score_ratio:
            # stop after the first shift; this is now *THE* TSS
            return gffpos
    return start_codon_pos


def correct_unigene_for_utrs(unigene_gff_list,
                             start_codon_gff=(),
                             stop_codon_gff=(),
                             minimal_likely_tss_pssm_score=3.0,
                             shift_tss_pssm_score_ratio=4.0,
                             dnaseqfname=None,
                             verbose=False):
    """
    Check if unigene contains evidence for non-coding UTRs and if so, correct

    Start and stop codon positions are taken from the external unigene
    annotation program when dnaseqfname is given, otherwise from the
    given start_codon_gff / stop_codon_gff tuples. Unigene exon
    coordinates in front of the start codon are returned as 5'UTR
    tracks, coordinates behind the stop codon as 3'UTR tracks, and the
    remaining coordinates as (shortened) unigene exon tracks. When the
    stop codon position lies in a unigene exon, an 'UGstop' track is
    added as well.

    @type  unigene_gff_list: list
    @param unigene_gff_list: list with uncorrected unigene gff tuples

    @type  start_codon_gff: tuple
    @param start_codon_gff: tuple representing the (annotated) protein's start codon

    @type  stop_codon_gff: tuple
    @param stop_codon_gff: tuple representing the (annotated) protein's stop codon

    @type  dnaseqfname: string (or None)
    @param dnaseqfname: filename of DNA sequence corresponding to the unigene's GFF

    @type  minimal_likely_tss_pssm_score: float
    @param minimal_likely_tss_pssm_score: minimal (likely) PSSM score of the TSS

    @type  shift_tss_pssm_score_ratio: float
    @param shift_tss_pssm_score_ratio: shift TSS downstream when ratio is exceeded

    @type  verbose: Boolean
    @param verbose: print discrepancies to STDOUT (True) or not (False, default)

    @rtype:  list of (gff) tuples + typeofunigene string
    @return: list with corrected gff tuples + string (None when no
             unigene annotation was performed). When neither a dna
             sequence nor start/stop codons are given, the input list
             itself is returned unchanged.

    @attention: Global variable GFF_UGEXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG3UTREXON_FMETHOD  is required for this function
    @attention: Global variable GFF_UG5UTREXON_FMETHOD  is required for this function
    @attention: Global variables PYTHON_PATH and EXECUTABLE_UNIGENEANNOTATION
                are required when dnaseqfname is given
    """
    # function-scope import: replacement for the deprecated os.popen2
    import subprocess

    # return list with corrected unigene tracks
    return_unigene_gff_list = []
    start_codon_pos = None
    stop_codon_pos = None
    typeofunigene = None
    # make sets of unigene coordinates
    unigene_coordinate_set = gffs2coordset(unigene_gff_list,
                                           fmethod=[GFF_UGEXON_FMETHOD])

    if dnaseqfname:
        unigeneexons = filtergffs4fmethod(unigene_gff_list, GFF_UGEXON_FMETHOD)
        unigeneexons.sort()
        # read the sequence; make sure the file handle is closed again
        fh = open(dnaseqfname)
        try:
            header, dnaseq, _descr = parseSingleFasta(fh.readlines())
        finally:
            fh.close()
        # replace fasta header for correct recognition
        for i, exon in enumerate(unigeneexons):
            gff = list(exon)
            gff[0] = header
            # correct for negative coordinate. This can happen in case
            # the unigene sticks out of the genelocus
            if gff[3] <= 0:
                gff[3] = 1
            unigeneexons[i] = tuple(gff)

        # run unigeneannotation command; the exons are offered on STDIN.
        # shell=True keeps the command string interpreted by the shell,
        # exactly as os.popen2 did. communicate() closes the pipes and
        # waits for the child process to finish.
        command = "%s %s %s" % (PYTHON_PATH, EXECUTABLE_UNIGENEANNOTATION,
                                dnaseqfname)
        proc = subprocess.Popen(command, shell=True,
                                stdin=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                universal_newlines=True)
        output = proc.communicate(gffs2txt(unigeneexons))[0]
        ugannotation = output.strip().split("\t")
        typeofunigene = ugannotation[0]
        # abstract coordinates of start and stop codon from unigene
        # annotation; a missing or non-numerical column means 'unknown'
        try:
            start_codon_pos = int(ugannotation[5])
        except (IndexError, ValueError):
            start_codon_pos = None
        try:
            stop_codon_pos = int(ugannotation[6])
        except (IndexError, ValueError):
            stop_codon_pos = None

        ################################################################
        if verbose:
            for track in unigene_gff_list:
                print(track)
            print("%s %s %s" % (ugannotation, start_codon_pos,
                                stop_codon_pos))
            print("given ATG: %s" % (start_codon_gff,))
            print("given TGA: %s" % (stop_codon_gff,))
        ################################################################

        if start_codon_pos:
            # check if the PythonRegex obtained Methionine is the most
            # likely TSS. When a far better one is available -> shift
            # the TSS downstream (5p->3p) to this better TSS.
            start_codon_pos = _shift_to_likely_tss(
                dnaseq, start_codon_pos, int(unigeneexons[0][4]),
                minimal_likely_tss_pssm_score,
                shift_tss_pssm_score_ratio)
        elif start_codon_gff:
            # unigene is a fragment or other transcript without
            # likely ATG. Fortunately, ATG is applied from the given
            # gene structure. Take this one.
            start_codon_pos = int(start_codon_gff[3])
        else:
            # NO start_codon_pos available -> unigene fragment!
            pass

    elif start_codon_gff or stop_codon_gff:
        # No dna sequence is applied to verify the ATG/TGA
        # positions of the unigene by unigene annotation.
        # Abstract coordinates of start and/or stop codons
        # from the given coordinates (from the gene's annotation)
        if start_codon_gff:
            start_codon_pos = int(start_codon_gff[3])
        if stop_codon_gff:
            stop_codon_pos = int(stop_codon_gff[4])

    else:
        ########################################################
        if verbose:
            print("NONE GIVEN seq/sta/end: %s" % (dnaseqfname,))
            print("gff ATG: %s" % (start_codon_gff,))
            print("gff TGA: %s" % (stop_codon_gff,))
        ########################################################
        # no anchors applied in terms of start/stop sites
        # TODO future update: find or predict the putative orf
        # of this unigene. That specific functionallity should
        # NOT be placed in this function!
        # for the time being, just return the input gff list.
        return unigene_gff_list, typeofunigene

    # create an unigene stop codon track when in unigene_coordinate_set
    if stop_codon_pos and stop_codon_pos in unigene_coordinate_set:
        # copy the first unigene track and update type and coordinates
        newgff = list(deepcopy(unigene_gff_list[0]))
        newgff[2] = 'UGstop'
        newgff[3] = stop_codon_pos - 2
        newgff[4] = stop_codon_pos
        return_unigene_gff_list.append(tuple(newgff))

    # CORRECT the unigene_coordinate_set for 5p nucleotides
    ignore_5p_coords = []
    if start_codon_pos is not None and start_codon_pos in\
    unigene_coordinate_set and min(unigene_coordinate_set) < start_codon_pos:
        if verbose:
            print("CREATE 5pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 5p unigene alignment part
        ignore_5p_coords = [coord for coord in unigene_coordinate_set
                            if coord < start_codon_pos]
        # remove from the unigene coord set
        for coord in ignore_5p_coords:
            unigene_coordinate_set.remove(coord)

    # CORRECT the unigene_coordinate_set for 3p nucleotides
    # (the stop codon itself remains part of the unigene exon)
    ignore_3p_coords = []
    if stop_codon_pos is not None and stop_codon_pos in\
    unigene_coordinate_set and max(unigene_coordinate_set) > stop_codon_pos:
        if verbose:
            print("CREATE 3pUTR for ug: %s" % (typeofunigene,))
        # yes, there is a 3p unigene alignment part
        ignore_3p_coords = [coord for coord in unigene_coordinate_set
                            if coord > stop_codon_pos]
        # remove from the unigene coord set
        for coord in ignore_3p_coords:
            unigene_coordinate_set.remove(coord)

    # make (new) UGExon tracks, corrected for UTRS, if needed
    if not unigene_coordinate_set:
        # hmm... not really expected. There are UniGene tracks,
        # but no UniGene exons are recognized. Probably a wrong setting
        # applied for GFF_UGEXON_FMETHOD (not identical to the naming in
        # the input gff).
        pass

    elif not (ignore_5p_coords or ignore_3p_coords):
        # no utrs available; just set the input to the output list
        return_unigene_gff_list.extend(unigene_gff_list)

    else:
        # all new tracks are copies of the first unigene track
        template_gff = unigene_gff_list[0]

        # create new gff tracks for the remaining unigene exon
        # coordinates; these keep the fmethod of the template track
        return_unigene_gff_list.extend(_coords2gff_tracks(
            template_gff, unigene_coordinate_set, template_gff[2]))

        # make UTR5UGExon track(s) if existing and place them, in
        # ascending order, in front of the return unigene gff list
        if ignore_5p_coords:
            return_unigene_gff_list[0:0] = _coords2gff_tracks(
                template_gff, ignore_5p_coords, GFF_UG5UTREXON_FMETHOD)

        # make UTR3UGExon track(s) if existing and append them
        if ignore_3p_coords:
            return_unigene_gff_list.extend(_coords2gff_tracks(
                template_gff, ignore_3p_coords, GFF_UG3UTREXON_FMETHOD))

    # order the unigene gff list (stop codon potentially on the front)
    return_unigene_gff_list = order_gff_list(return_unigene_gff_list)
    ################################################################
    if verbose and (ignore_5p_coords or ignore_3p_coords):
        for track in return_unigene_gff_list:
            print(track)
    ################################################################

    # done! return the new list
    return return_unigene_gff_list, typeofunigene