Example #1
0
def PCG2similarityarray(PCG,organism,aalength,omit_unigenes=True):
    """
    Sum the per-informant similarity arrays of `organism` into one array.

    Arguments:
      PCG            object providing organism_set() and
                     get_pacbps_by_organisms()
      organism       identifier of the organism used as query
      aalength       length passed to zeros() and to
                     pacbporflist2similarityarray()
      omit_unigenes  when true, an informant is left out if the orfS of its
                     first ordered PacbPORF has the attribute named by
                     ORF_IS_UNIGENE_LABEL

    Returns the array created by zeros(aalength) after all informant arrays
    have been added to it in place.
    """
    total = zeros(aalength)
    for informant in PCG.organism_set():
        # the query organism is never compared against itself
        if organism == informant:
            continue
        ordered = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(organism,informant))
        # only the first PacbPORF is inspected for the unigene label
        skip_as_unigene = ordered and omit_unigenes and\
                hasattr(ordered[0].orfS,ORF_IS_UNIGENE_LABEL)
        if not skip_as_unigene:
            total += pacbporflist2similarityarray(ordered,"query",aalength)

    # return the accumulated similarity array
    return total
Example #2
0
def PCG2codingarray(PCG, organism, aalength, omit_unigenes=True):
    """
    Sum the per-informant coding/presence arrays of `organism`.

    Arguments:
      PCG            object providing organism_set() and
                     get_pacbps_by_organisms()
      organism       identifier of the organism used as query
      aalength       length passed to zeros() and to
                     pacbporflist2codingarray()
      omit_unigenes  when true, an informant is left out if the orfS of its
                     first ordered PacbPORF has the attribute named by
                     ORF_IS_UNIGENE_LABEL

    Returns the array created by zeros(aalength) with every informant array
    added to it in place.
    """
    presence_sum = zeros(aalength)
    informants = PCG.organism_set()
    for other in informants:
        if organism == other:
            # no self-comparison
            continue
        pf_list = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(organism, other))
        if pf_list and omit_unigenes:
            # the label is looked up on the first PacbPORF only
            if hasattr(pf_list[0].orfS, ORF_IS_UNIGENE_LABEL):
                continue
        presence_sum += pacbporflist2codingarray(pf_list, "query", aalength)

    # return the accumulated coding/presence array
    return presence_sum
Example #3
0
def _convert_tinyexon_proteinmatches_to_pacbporfs(target,
                                                  protmatches,
                                                  tinyexondata,
                                                  PCG,
                                                  min_discovery_count=2):
    """
    Turn matching tinyexon pairs (target x informant) into "PP" PacbPORFs.

    For every informant in `tinyexondata` (the target itself is skipped),
    each target tinyexon whose Orf is not yet the query Orf of a PacbPORF
    between target and informant is paired with the informant tinyexons.
    A pair is converted with exononorfs2pacbporf() when both protein
    sequences are keys of `protmatches` and _are_tinyexons_similar()
    accepts the pair. Each PacbPORF obtained gets `_tinyexon_label = "PP"`
    and is handed to _update_tinyexon_pacbporf_dict() under the key
    (query protein sequence, query exon start), together with a `rejected`
    flag and the informant.

    Arguments:
      target               key of the target organism in `tinyexondata`
      protmatches          dict-like; only key membership is tested
      tinyexondata         dict: organism -> iterable of tinyexon objects
      PCG                  object providing get_pacbps_by_organisms()
      min_discovery_count  forwarded as `min_value_list_size` to
                           _remove_dict_elements_with_short_value_list()

    Returns the dict that was filled and then pruned.
    """
    target_tinyexon_pacbporf_data = {}
    # fish these protein matches from the tinyexons and convert to PacbPORFs
    for informant in tinyexondata:
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        # ids of query Orfs already covered by a PacbPORF; this does not
        # depend on the exon, so it is built once per informant
        covered_query_orf_ids = [pf.orfQ.id for pf in thepacbporfs]
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in covered_query_orf_ids: continue
            # direct membership test; no intermediate key list is built
            if exonQ.proteinsequence() not in protmatches: continue
            for exonS in tinyexondata[informant]:
                # NOTE(review): the break presumes the informant tinyexons
                # are ordered by increasing length -- confirm at the caller
                if exonS.length > exonQ.length: break
                if exonS.proteinsequence() not in protmatches: continue
                # omit non-identical exons
                if not _are_tinyexons_similar(exonQ, exonS): continue
                # if here: similar exons. make PacbPORF
                pacbporf = exononorfs2pacbporf(exonQ,
                                               exonS,
                                               matrix=TINYEXON_MATRIX)
                if not pacbporf: continue
                # check if placeable in PCG/pacbporflist: a single
                # incompatible PacbPORF marks it as rejected.
                # NOTE(review): method name spelling (`postioned`) is kept
                # as found; verify against the PacbPORF class
                rejected = False in [
                    pf.is_postioned_compatibly(pacbporf) for pf in thepacbporfs
                ]

                # label pacbporf as found by tinyexon PP
                pacbporf._tinyexon_label = "PP"

                # store to target_tinyexon_pacbporf_data (rejected ones are
                # stored too, flagged as such)
                key = (exonQ.proteinsequence(), exonQ.start)
                _update_tinyexon_pacbporf_dict(target_tinyexon_pacbporf_data,
                                               key, pacbporf, rejected,
                                               informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data, min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Example #4
0
def _has_pp_tinyexonpacbporf_perfect_introns(tinyexonPF,target,informant,PCG):
    """ """
    # check if a (perfect) introns can be mapped
    is_confirmed_with_introns = False

    if tinyexonPF._tinyexon_label != 'PP': return False

    # get ordered PacbPORFS for this informant
    thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(target,informant))

    for pos in range(1,len(thepacbporfs)):
        prevPF,nextPF = thepacbporfs[pos-1],thepacbporfs[pos]
        if prevPF.distance_towards(tinyexonPF) > 0 and\
        tinyexonPF.distance_towards(nextPF) > 0:
            intronsPREV = merge_pacbporfs_with_introns(
                    prevPF,tinyexonPF,max_aa_offset=0,
                    max_intron_nt_length=None)
            intronsNEXT = merge_pacbporfs_with_introns(
                    tinyexonPF,nextPF,max_aa_offset=0,
                    max_intron_nt_length=None)
            if len(intronsPREV) >= 1 and len(intronsNEXT) >= 1:
                perfect_prev_intron = False
                perfect_next_intron = False
                for intronQ,intronS in intronsPREV:
                    intronQ.assign_bp_and_ppts()
                    intronS.assign_bp_and_ppts()
                    if intronQ.branchpoint and intronS.branchpoint:
                        perfect_prev_intron = True
                        break
                for intronQ,intronS in intronsNEXT:
                    intronQ.assign_bp_and_ppts()
                    intronS.assign_bp_and_ppts()
                    if intronQ.branchpoint and intronS.branchpoint:
                        perfect_next_intron = True
                        break
                # check if both intron options have a perfect candidate
                if perfect_prev_intron and perfect_next_intron:
                    is_confirmed_with_introns = True
            # break out
            break

    # return is_confirmed_with_introns status
    return is_confirmed_with_introns
Example #5
0
def _has_pp_tinyexonpacbporf_perfect_introns(tinyexonPF, target, informant,
                                             PCG):
    """ """
    # check if a (perfect) introns can be mapped
    is_confirmed_with_introns = False

    if tinyexonPF._tinyexon_label != 'PP': return False

    # get ordered PacbPORFS for this informant
    thepacbporfs = order_pacbporf_list(
        PCG.get_pacbps_by_organisms(target, informant))

    for pos in range(1, len(thepacbporfs)):
        prevPF, nextPF = thepacbporfs[pos - 1], thepacbporfs[pos]
        if prevPF.distance_towards(tinyexonPF) > 0 and\
        tinyexonPF.distance_towards(nextPF) > 0:
            intronsPREV = merge_pacbporfs_with_introns(
                prevPF, tinyexonPF, max_aa_offset=0, max_intron_nt_length=None)
            intronsNEXT = merge_pacbporfs_with_introns(
                tinyexonPF, nextPF, max_aa_offset=0, max_intron_nt_length=None)
            if len(intronsPREV) >= 1 and len(intronsNEXT) >= 1:
                perfect_prev_intron = False
                perfect_next_intron = False
                for intronQ, intronS in intronsPREV:
                    intronQ.assign_bp_and_ppts()
                    intronS.assign_bp_and_ppts()
                    if intronQ.branchpoint and intronS.branchpoint:
                        perfect_prev_intron = True
                        break
                for intronQ, intronS in intronsNEXT:
                    intronQ.assign_bp_and_ppts()
                    intronS.assign_bp_and_ppts()
                    if intronQ.branchpoint and intronS.branchpoint:
                        perfect_next_intron = True
                        break
                # check if both intron options have a perfect candidate
                if perfect_prev_intron and perfect_next_intron:
                    is_confirmed_with_introns = True
            # break out
            break

    # return is_confirmed_with_introns status
    return is_confirmed_with_introns
Example #6
0
def _convert_tinyexon_proteinmatches_to_pacbporfs(target,protmatches,
    tinyexondata,PCG,min_discovery_count=2):
    """
    Convert tinyexon protein matches between target and informants into
    PacbPORFs labelled "PP".

    Per informant in `tinyexondata` (target skipped), a target tinyexon is
    considered when its Orf id is not among the orfQ ids of the ordered
    PacbPORFs of target and informant and its protein sequence is a key of
    `protmatches`. It is combined with each informant tinyexon whose
    protein sequence is a key of `protmatches` and which passes
    _are_tinyexons_similar(). Every PacbPORF that exononorfs2pacbporf()
    returns is labelled and stored via _update_tinyexon_pacbporf_dict()
    with key (query protein sequence, query exon start).

    Arguments:
      target               key of the target organism in `tinyexondata`
      protmatches          dict-like; only key membership is tested
      tinyexondata         dict: organism -> iterable of tinyexon objects
      PCG                  object providing get_pacbps_by_organisms()
      min_discovery_count  forwarded as `min_value_list_size` to
                           _remove_dict_elements_with_short_value_list()

    Returns the filled and pruned dict.
    """
    target_tinyexon_pacbporf_data = {}
    # fish these protein matches from the tinyexons and convert to PacbPORFs
    for informant in tinyexondata:
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target,informant))
        # invariant for all target exons of this informant: build it once
        query_orf_ids_in_pcg = [ pf.orfQ.id for pf in thepacbporfs ]
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in query_orf_ids_in_pcg: continue
            # membership on the mapping itself, without a temporary key list
            if exonQ.proteinsequence() not in protmatches: continue
            for exonS in tinyexondata[informant]:
                # NOTE(review): breaking here presumes informant tinyexons
                # are sorted on increasing length -- confirm at the caller
                if exonS.length > exonQ.length: break
                if exonS.proteinsequence() not in protmatches: continue
                # omit non-identical exons
                if not _are_tinyexons_similar(exonQ,exonS): continue
                # if here: similar exons. make PacbPORF
                pacbporf = exononorfs2pacbporf(exonQ,exonS,matrix=TINYEXON_MATRIX)
                if not pacbporf: continue
                # check if placeable in PCG/pacbporflist; one incompatible
                # PacbPORF suffices to flag it as rejected.
                # NOTE(review): spelling `is_postioned_compatibly` kept as
                # found; verify against the PacbPORF class
                compatibility = [ pf.is_postioned_compatibly(pacbporf)
                                  for pf in thepacbporfs ]
                rejected = False in compatibility

                # label pacbporf as found by tinyexon PP
                pacbporf._tinyexon_label = "PP"

                # store to target_tinyexon_pacbporf_data (also when rejected)
                key = (exonQ.proteinsequence(),exonQ.start)
                _update_tinyexon_pacbporf_dict(target_tinyexon_pacbporf_data,
                    key,pacbporf,rejected,informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
            target_tinyexon_pacbporf_data,
            min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Example #7
0
def update_PCG_with_signalpexons(signalpexonseqs,
                                 PCG,
                                 OPTIONS,
                                 min_pacbporf_identityscore=0.20,
                                 verbose=True):
    """ """
    if not signalpexonseqs.has_key(OPTIONS.target): return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[OPTIONS.target]:
        target = OPTIONS.target
        for informant, infSPlist in signalpexonseqs.iteritems():
            if informant == OPTIONS.target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            # get ordered pacbporfs fromt he PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(OPTIONS.target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))

                if False in [
                        pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                        for pacbporf in thepacbporfs
                ]:
                    # *NOT* placable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TO FAR in front of current gene structure parts.
                    # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS
                    continue
                elif dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue
                else:
                    pass

                    # perform ClustalW alignment on the SP exons
                    (alignedseqs,alignment) =\
                clustalw( seqs= {
                    OPTIONS.target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[OPTIONS.target], alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identyscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf,
                                                     informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                ################################################################
                if verbose:
                    print alignedseqs[OPTIONS.target], OPTIONS.target
                    print alignment
                    print alignedseqs[informant], informant
                    if pacbp:
                        print pacbp, (OPTIONS.target, targetSPexon.orf.id),
                        print(informant, informantSPexon.orf.id),
                        print "DISTANCE::", dist
                        pacbp.print_protein()
                        print ""
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,
                             OPTIONS.target,
                             informant,
                             PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print "SignalP Exon added to PCG:", signalp_pacbporf, informant
                ####################################################################
            else:
                pass

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
Example #8
0
def _find_qq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """ """
    target_tinyexon_pacbporf_data = {}
    for informant in tinyexondata.keys():
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target,informant))
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in [ pf.orfQ.id for pf in thepacbporfs ]: continue
            for (prevpos,nextpos) in [ (pos-1,pos) for pos in range(1,len(thepacbporfs)) ]:
                prevPF = thepacbporfs[prevpos]
                nextPF = thepacbporfs[nextpos]
                if prevPF.orfS.id == nextPF.orfS.id:

                    # check if PacbPORFs are positioned more or less okay
                    if prevPF.distance_towards(nextPF) > 20: continue

                    # check if exonQ is positioned ~between these PacbPORFs
                    if exonQ.orf.dnapos2aapos(exonQ.end) < max(prevPF.alignment_protein_range_query())-12:
                        continue
                    if exonQ.orf.dnapos2aapos(exonQ.start) > min(nextPF.alignment_protein_range_query())+12:
                        continue

                    # check if gap can be projected already by a perfect intron
                    introns = merge_pacbporfs_by_intron_in_query(
                                prevPF,nextPF,max_aa_offset=1)
                    # if introns found => continue
                    if introns: continue

                    # orfObj is the orfS of prevPF or nextPF (just take any)
                    orfObj = prevPF.orfS
                    # assign elegiable range of tinyexon match on SBJCT
                    aapos_sbjct_range = range(
                            max(prevPF.alignment_protein_range_sbjct())-12,
                            min(nextPF.alignment_protein_range_sbjct())+12
                            )

                    tinyexonmatches = _find_match_on_orfobj(exonQ,orfObj)
                    for (aaseq,aapos) in tinyexonmatches:
                        # check if the match is obtained in the expected
                        # sbjct AA range; if not, ignore the match
                        if aapos not in aapos_sbjct_range: continue

                        # make pacbporf object
                        pacbpobj = PacbP(input=(
                                exonQ.proteinsequence(), aaseq,
                                exonQ.orf.dnapos2aapos(exonQ.start), aapos ) )
                        pacbporfobj = pacbp2pacbporf(pacbpobj,exonQ.orf,orfObj)
                        pacbporfobj.extend_pacbporf_after_stops()
        
                        # remove included pacbporfs
                        is_suborsuperset = False
                        for accepted_pacbporf in thepacbporfs:
                            if pacbporfobj.issubsetorsuperset(accepted_pacbporf):
                                is_suborsuperset = True
                                break
                        if is_suborsuperset:
                            continue
    

                        # check if 2 (perfect) introns can be projected
                        introns5p = merge_pacbporfs_by_intron_in_query(
                                prevPF,pacbporfobj,
                                max_aa_offset=1,
                                max_intron_nt_length=None)
                                #max_intron_nt_length=140)
                        introns3p = merge_pacbporfs_by_intron_in_query(
                                pacbporfobj,nextPF,
                                max_aa_offset=1,
                                max_intron_nt_length=None)
                                #max_intron_nt_length=140)

                        # continue if not is_confirmed_by_intron_projection
                        if not introns5p or not introns3p: continue
    
                        # check if placeable in PCG/pacbporflist
                        distPrev = prevPF.distance_towards(pacbporfobj)
                        distNext = pacbporfobj.distance_towards(nextPF)
                        ovrlPrev = pacbporfobj.overlap(prevPF)
                        ovrlNext = pacbporfobj.overlap(nextPF)
                        if distPrev and distNext:
                            rejected = False
                        elif not distPrev and ovrlPrev:
                            rejected = False
                        elif not distNext and ovrlNext:
                            rejected = False
                        elif ovrlPrev and ovrlNext:
                            rejected = False
                        else:
                            rejected = True

                        print "OKAY", exonQ.proteinsequence(), aaseq, rejected, informant, (distPrev,distNext,ovrlPrev,ovrlNext)

                        # label pacbporf as found by tinyexon QQ
                        pacbporfobj._tinyexon_label = "QQ"

                        # store to target_tinyexon_pacbporf_data
                        key = (exonQ.proteinsequence(),exonQ.start)
                        _update_tinyexon_pacbporf_dict(
                                target_tinyexon_pacbporf_data,
                                key,pacbporfobj,rejected,informant)


    # cleanup tinyexon protein matches that have been observed to litte
    _remove_dict_elements_with_short_value_list(
            target_tinyexon_pacbporf_data,
            min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Example #9
0
def _find_qp_and_pq_tinyexons_as_pacbporfs(target,tinyexondata,PCG,min_discovery_count=2):
    """
    Search matches of target tinyexons on the informant Orfs of the PCG and
    store those confirmed by an intron projection as "QP" PacbPORFs.

    Per informant (target skipped) and per target tinyexon whose Orf is not
    yet the orfQ of a PacbPORF, every Orf from
    PCG.get_orfs_of_graph(organism=informant) is searched with
    _find_qp_or_pq_match_on_orfobj(). A match becomes a PacbPORF; it is kept
    when it is no sub/superset of an existing PacbPORF and at least one
    intron is returned by merge_pacbporfs_by_intron_in_query() between it
    and an existing PacbPORF on the same sbjct Orf.

    Arguments:
      target               key of the target organism in `tinyexondata`
      tinyexondata         dict: organism -> iterable of tinyexon objects
      PCG                  object providing get_pacbps_by_organisms() and
                           get_orfs_of_graph()
      min_discovery_count  forwarded as `min_value_list_size` to
                           _remove_dict_elements_with_short_value_list()

    Returns the dict filled through _update_tinyexon_pacbporf_dict() and
    pruned afterwards.
    """
    collected = {}
    for informant in tinyexondata.keys():
        if informant == target: continue
        ordered_pfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(target,informant))
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in [ pf.orfQ.id for pf in ordered_pfs ]: continue
            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                for (aaseq,aapos) in _find_qp_or_pq_match_on_orfobj(exonQ,orfObj):
                    # make pacbporf object
                    pacbpobj = PacbP(input=(
                            exonQ.proteinsequence(), aaseq,
                            exonQ.orf.dnapos2aapos(exonQ.start), aapos ) )
                    candidate = pacbp2pacbporf(pacbpobj,exonQ.orf,orfObj)
                    candidate.extend_pacbporf_after_stops()

                    # skip candidates that are sub/superset of a PacbPORF
                    # that is already present
                    if any( candidate.issubsetorsuperset(pf)
                            for pf in ordered_pfs ):
                        continue

                    # check if a (perfect) intron can be projected towards
                    # an accepted PacbPORF on the same sbjct Orf
                    for accepted in ordered_pfs:
                        if not (accepted.orfS.id == candidate.orfS.id):
                            continue
                        # the PacbPORF that starts first on the query DNA
                        # is passed as first argument
                        if min(accepted.alignment_dna_range_query()) > min(candidate.alignment_dna_range_query()):
                            first, second = candidate, accepted
                        else:
                            first, second = accepted, candidate
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                first,second,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            # confirmed by intron projection
                            break
                    else:
                        # loop ended without break: not confirmed by any
                        # intron projection, so go to the next match
                        continue

                    # check if placeable in PCG/pacbporflist; one
                    # incompatible PacbPORF flags it as rejected
                    rejected = False in [ pf.is_postioned_compatibly(candidate) for pf in ordered_pfs ]

                    # label pacbporf as found by tinyexon QP
                    candidate._tinyexon_label = "QP"

                    # store under (protein sequence, exon start)
                    key = (exonQ.proteinsequence(),exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                            collected,key,candidate,rejected,informant)

    # cleanup tinyexon matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
            collected,
            min_value_list_size=min_discovery_count)

    return collected
Example #10
0
def _find_qq_tinyexons_as_pacbporfs(target,
                                    tinyexondata,
                                    PCG,
                                    min_discovery_count=2):
    """ """
    target_tinyexon_pacbporf_data = {}
    for informant in tinyexondata.keys():
        if informant == target: continue
        thepacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        for exonQ in tinyexondata[target]:
            if exonQ.orf.id in [pf.orfQ.id for pf in thepacbporfs]: continue
            for (prevpos, nextpos) in [(pos - 1, pos)
                                       for pos in range(1, len(thepacbporfs))]:
                prevPF = thepacbporfs[prevpos]
                nextPF = thepacbporfs[nextpos]
                if prevPF.orfS.id == nextPF.orfS.id:

                    # check if PacbPORFs are positioned more or less okay
                    if prevPF.distance_towards(nextPF) > 20: continue

                    # check if exonQ is positioned ~between these PacbPORFs
                    if exonQ.orf.dnapos2aapos(exonQ.end) < max(
                            prevPF.alignment_protein_range_query()) - 12:
                        continue
                    if exonQ.orf.dnapos2aapos(exonQ.start) > min(
                            nextPF.alignment_protein_range_query()) + 12:
                        continue

                    # check if gap can be projected already by a perfect intron
                    introns = merge_pacbporfs_by_intron_in_query(
                        prevPF, nextPF, max_aa_offset=1)
                    # if introns found => continue
                    if introns: continue

                    # orfObj is the orfS of prevPF or nextPF (just take any)
                    orfObj = prevPF.orfS
                    # assign elegiable range of tinyexon match on SBJCT
                    aapos_sbjct_range = range(
                        max(prevPF.alignment_protein_range_sbjct()) - 12,
                        min(nextPF.alignment_protein_range_sbjct()) + 12)

                    tinyexonmatches = _find_match_on_orfobj(exonQ, orfObj)
                    for (aaseq, aapos) in tinyexonmatches:
                        # check if the match is obtained in the expected
                        # sbjct AA range; if not, ignore the match
                        if aapos not in aapos_sbjct_range: continue

                        # make pacbporf object
                        pacbpobj = PacbP(
                            input=(exonQ.proteinsequence(), aaseq,
                                   exonQ.orf.dnapos2aapos(exonQ.start), aapos))
                        pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf,
                                                     orfObj)
                        pacbporfobj.extend_pacbporf_after_stops()

                        # remove included pacbporfs
                        is_suborsuperset = False
                        for accepted_pacbporf in thepacbporfs:
                            if pacbporfobj.issubsetorsuperset(
                                    accepted_pacbporf):
                                is_suborsuperset = True
                                break
                        if is_suborsuperset:
                            continue

                        # check if 2 (perfect) introns can be projected
                        introns5p = merge_pacbporfs_by_intron_in_query(
                            prevPF,
                            pacbporfobj,
                            max_aa_offset=1,
                            max_intron_nt_length=None)
                        #max_intron_nt_length=140)
                        introns3p = merge_pacbporfs_by_intron_in_query(
                            pacbporfobj,
                            nextPF,
                            max_aa_offset=1,
                            max_intron_nt_length=None)
                        #max_intron_nt_length=140)

                        # continue if not is_confirmed_by_intron_projection
                        if not introns5p or not introns3p: continue

                        # check if placeable in PCG/pacbporflist
                        distPrev = prevPF.distance_towards(pacbporfobj)
                        distNext = pacbporfobj.distance_towards(nextPF)
                        ovrlPrev = pacbporfobj.overlap(prevPF)
                        ovrlNext = pacbporfobj.overlap(nextPF)
                        if distPrev and distNext:
                            rejected = False
                        elif not distPrev and ovrlPrev:
                            rejected = False
                        elif not distNext and ovrlNext:
                            rejected = False
                        elif ovrlPrev and ovrlNext:
                            rejected = False
                        else:
                            rejected = True

                        print "OKAY", exonQ.proteinsequence(
                        ), aaseq, rejected, informant, (distPrev, distNext,
                                                        ovrlPrev, ovrlNext)

                        # label pacbporf as found by tinyexon QQ
                        pacbporfobj._tinyexon_label = "QQ"

                        # store to target_tinyexon_pacbporf_data
                        key = (exonQ.proteinsequence(), exonQ.start)
                        _update_tinyexon_pacbporf_dict(
                            target_tinyexon_pacbporf_data, key, pacbporfobj,
                            rejected, informant)

    # cleanup tinyexon protein matches that have been observed to litte
    _remove_dict_elements_with_short_value_list(
        target_tinyexon_pacbporf_data, min_value_list_size=min_discovery_count)

    # return target_tinyexon_pacbporf_data
    return target_tinyexon_pacbporf_data
Example #11
0
def _find_qp_and_pq_tinyexons_as_pacbporfs(target,
                                           tinyexondata,
                                           PCG,
                                           min_discovery_count=2):
    """
    Collect QP/PQ tinyexon matches of the target as PacbPORF objects.

    For every informant key in ``tinyexondata`` (the target itself is
    skipped), each tinyexon of the target is matched against all informant
    Orfs of the PCG with ``_find_qp_or_pq_match_on_orfobj``. Every match is
    converted into a PacbPORF, which is kept only when it
      - is not a sub- or superset of a PacbPORF already present between
        target and informant, and
      - can be joined by at least one intron (``max_aa_offset=0``) to an
        already present PacbPORF that lies on the same informant Orf.

    Kept PacbPORFs get ``_tinyexon_label = "QP"`` and are registered, along
    with a ``rejected`` flag and the informant, under the key
    ``(tinyexon protein sequence, tinyexon start)``.

    target              -- organism identifier of the target; must be a key
                           of ``tinyexondata``
    tinyexondata        -- mapping of organism identifier to an iterable of
                           tinyexon objects
    PCG                 -- graph object offering ``get_pacbps_by_organisms``
                           and ``get_orfs_of_graph``
    min_discovery_count -- passed as ``min_value_list_size`` to the final
                           cleanup of the result dictionary

    Returns the dictionary after the cleanup step.
    """
    collected = {}
    for informant in tinyexondata.keys():
        if informant == target:
            continue
        known_pacbporfs = order_pacbporf_list(
            PCG.get_pacbps_by_organisms(target, informant))
        for exonQ in tinyexondata[target]:
            # skip tinyexons whose Orf is already used as query Orf
            query_orf_ids = [pf.orfQ.id for pf in known_pacbporfs]
            if exonQ.orf.id in query_orf_ids:
                continue
            for orfObj in PCG.get_orfs_of_graph(organism=informant):
                matches = _find_qp_or_pq_match_on_orfobj(exonQ, orfObj)
                for (aaseq, aapos) in matches:
                    # build the candidate PacbPORF for this match
                    query_seq = exonQ.proteinsequence()
                    query_aapos = exonQ.orf.dnapos2aapos(exonQ.start)
                    candidate = pacbp2pacbporf(
                        PacbP(input=(query_seq, aaseq, query_aapos, aapos)),
                        exonQ.orf, orfObj)
                    candidate.extend_pacbporf_after_stops()

                    # drop candidates that are included in (or include)
                    # an already present pacbporf
                    if any(candidate.issubsetorsuperset(known)
                           for known in known_pacbporfs):
                        continue

                    # look for a present pacbporf on the same informant Orf
                    # to which the candidate can be linked by an intron
                    for known in known_pacbporfs:
                        if known.orfS.id != candidate.orfS.id:
                            continue
                        # the pacbporf that starts first on the query is
                        # handed over as first argument
                        if (min(known.alignment_dna_range_query()) >
                                min(candidate.alignment_dna_range_query())):
                            first, second = candidate, known
                        else:
                            first, second = known, candidate
                        try:
                            introns = merge_pacbporfs_by_intron_in_query(
                                first,
                                second,
                                max_aa_offset=0,
                                max_intron_nt_length=None)
                        except IndexError:
                            # unexpected event: TODO: solve in
                            # merge_pacbporfs_by_intron_in_query
                            introns = []
                        if len(introns) >= 1:
                            # confirmed by intron projection
                            break
                    else:
                        # no present pacbporf confirmed this candidate
                        continue

                    # check if placeable in PCG/pacbporflist; all present
                    # pacbporfs are evaluated.
                    # NOTE(review): method name is spelled
                    # 'is_postioned_compatibly' here -- confirm that this
                    # spelling exists on the PacbPORF class.
                    compatibility = [
                        pf.is_postioned_compatibly(candidate)
                        for pf in known_pacbporfs
                    ]
                    rejected = False in compatibility

                    # label pacbporf as found by tinyexon QP
                    candidate._tinyexon_label = "QP"

                    # store to the result dictionary
                    key = (exonQ.proteinsequence(), exonQ.start)
                    _update_tinyexon_pacbporf_dict(
                        collected, key, candidate, rejected, informant)

    # cleanup tinyexon protein matches that have been observed too little
    _remove_dict_elements_with_short_value_list(
        collected, min_value_list_size=min_discovery_count)

    return collected
Example #12
0
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS,
    min_pacbporf_identityscore=0.20,verbose=True):
    """
    Try to add SignalP-exon guided PacbPORFs to the PCG.

    Each SignalP exon of the target is combined with each SignalP exon of
    every informant that is (still) present in the PCG.  A combination is
    taken further only when its coordinates are positioned compatibly with
    all PacbPORFs already present between target and informant and when it
    lies in front of the first of them, within
    SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3.  The two protein sequences
    are then aligned with ClustalW and converted into an extended PacbPORF.
    Per target exon / informant combination, the first PacbPORF after
    ordering on 'bits' (reversed) is stored into the PCG.

    signalpexonseqs            -- mapping of organism identifier to a list
                                  of SignalP exon objects
    PCG                        -- graph that is read from and written to
    OPTIONS                    -- object with a ``target`` attribute
    min_pacbporf_identityscore -- alignments with a lower identityscore
                                  are ignored
    verbose                    -- print alignments and added PacbPORFs

    Returns True when at least one PacbPORF was added to the PCG, False
    otherwise (also when the target has no entry in ``signalpexonseqs``).
    """
    target = OPTIONS.target
    # membership test with ``in``; dict.has_key() does not exist in Python 3
    if target not in signalpexonseqs: return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[target]:
        for informant,infSPlist in signalpexonseqs.items():
            if informant == target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # get ordered pacbporfs from the PCG
            thepacbporfs = order_pacbporf_list(
                    PCG.get_pacbps_by_organisms(target,informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            for informantSPexon in infSPlist:
                coords  = [ targetSPexon.protein_start(),
                            targetSPexon.protein_end(),
                            informantSPexon.protein_start(),
                            informantSPexon.protein_end(), ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                        targetSPexon.proteinsequence(),
                        informantSPexon.proteinsequence(),
                        targetSPexon.protein_start(),
                        informantSPexon.protein_start(),
                        ) )

                if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]:
                    # *NOT* placeable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3:
                    # WAY TOO FAR in front of current gene structure parts.
                    # Do not allow; treat as *NOT* placeable.
                    continue
                if dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue

                # perform ClustalW alignment on the SP exons
                (alignedseqs,alignment) = clustalw( seqs= {
                    target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                            alignment=(
                                    alignedseqs[target],
                                    alignment,
                                    alignedseqs[informant]
                                    ),
                            coords=coords
                            )

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identityscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp,
                        targetSPexon.orf,informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append( signalpexonPacbpORF )

                ################################################################
                if verbose:
                    # single-argument print(...) calls give the same output
                    # under the print statement and the print function
                    print("%s %s" % (alignedseqs[target], target))
                    print(alignment)
                    print("%s %s" % (alignedseqs[informant], informant))
                    print("%s %s %s DISTANCE:: %s" % (
                            pacbp,
                            (target, targetSPexon.orf.id),
                            (informant, informantSPexon.orf.id),
                            dist ))
                    pacbp.print_protein()
                    print("")
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                        signalpexon_pacbp_list,order_by='bits',reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,target,informant,PCG,source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print("SignalP Exon added to PCG: %s %s" % (
                            signalp_pacbporf, informant ))
                ####################################################################

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added