def multiplealignment(self):
        """
        Return the ClustalW multiple alignment of this graph as fasta text.

        The protein sequences and coordinates obtained from
        get_maxsr_proteinsequences_and_coords() are re-keyed from Node to
        Organism identifier, aligned with clustalw() and passed through
        strip_alignment_for_exterior_gaps() (trim leading & trailing gaps).

        @rtype:  string
        @return: multi-line fasta string, one ">ORGANISM_orf_X" record per
                 aligned sequence, X being element [1] of the Organism's Node
        """
        node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

        # coordinates per Organism as [ lowest position, highest position + 1 ]
        org_coords = {}
        for node, positions in node_coords.iteritems():
            organism = self.organism_by_node(node)
            org_coords[organism] = [min(positions), max(positions) + 1]

        # sequences per Organism
        org_seqs = {}
        for node, sequence in node_seqs.iteritems():
            organism = self.organism_by_node(node)
            org_seqs[organism] = sequence

        # align with ClustalW, then trim the exterior gaps
        aligned, matchline = clustalw(seqs=org_seqs)
        aligned, matchline, org_coords = strip_alignment_for_exterior_gaps(
            aligned, matchline, org_coords)

        # build one fasta record per aligned sequence
        records = []
        for organism, algseq in aligned.iteritems():
            orf_id = self.node_by_organism(organism)[1]
            records.append(">%s_orf_%s\n%s" % (organism, orf_id, algseq))
        return "\n".join(records)
Example #2
0
    def multiplealignment(self):
        """
        Return the ClustalW multiple alignment of this graph as fasta text.

        The protein sequences and coordinates obtained from
        get_maxsr_proteinsequences_and_coords() are re-keyed from Node to
        Organism identifier, aligned with clustalw() and passed through
        strip_alignment_for_exterior_gaps() (trim leading & trailing gaps).

        @rtype:  string
        @return: multi-line fasta string, one ">ORGANISM_orf_X" record per
                 aligned sequence, X being element [1] of the Organism's Node
        """
        node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

        # coordinates per Organism as [ lowest position, highest position + 1 ]
        org_coords = {}
        for node, positions in node_coords.iteritems():
            organism = self.organism_by_node(node)
            org_coords[organism] = [min(positions), max(positions) + 1]

        # sequences per Organism
        org_seqs = {}
        for node, sequence in node_seqs.iteritems():
            organism = self.organism_by_node(node)
            org_seqs[organism] = sequence

        # align with ClustalW, then trim the exterior gaps
        aligned, matchline = clustalw(seqs=org_seqs)
        aligned, matchline, org_coords = strip_alignment_for_exterior_gaps(
            aligned, matchline, org_coords)

        # build one fasta record per aligned sequence
        records = []
        for organism, algseq in aligned.iteritems():
            orf_id = self.node_by_organism(organism)[1]
            records.append(">%s_orf_%s\n%s" % (organism, orf_id, algseq))
        return "\n".join(records)
Example #3
0
 def get_unguided_nt_identity(self):
     """
     Get identity% of UNGUIDED DNA alignment

     The aligned DNA sequences of this object are stripped of their gaps
     and re-aligned from scratch with clustalw(); the returned value is
     the number of identical alignment columns divided by the length of
     the aligned query sequence.

     @rtype:  float
     @return: identity ratio; 0.0 for a zero-length object
     """
     # nothing to align in a zero-sized object
     if self.length == 0:
         return 0.0

     # ungapped DNA of query and sbjct
     query_dna, sbjct_dna = self.get_aligned_dna_sequences()
     query_dna, sbjct_dna = query_dna.replace("-", ""), sbjct_dna.replace("-", "")

     # (semi) unique fasta headers: barcode coordinates + random tag
     tag = get_random_string_tag()
     (q_sta, q_end, s_sta, s_end) = self.barcode()[0:4]
     query_key = "query%s%s%s" % (q_sta, q_end, tag)
     sbjct_key = "sbjct%s%s%s" % (s_sta, s_end, tag)

     # run clustalw on both sequences
     aligned, _matchline = clustalw(
         seqs={query_key: query_dna, sbjct_key: sbjct_dna})

     # count the columns in which query and sbjct are identical
     aligned_query = aligned[query_key]
     identical = sum(
         1 for pos, residue in enumerate(aligned_query)
         if residue == aligned[sbjct_key][pos])

     # relative ratio towards the aligned query length
     return float(identical) / len(aligned_query)
Example #4
0
 def get_unguided_nt_identity(self):
     """
     Get identity% of UNGUIDED DNA alignment

     The aligned DNA sequences of this object are stripped of their gaps
     and re-aligned from scratch with clustalw(); the returned value is
     the number of identical alignment columns divided by the length of
     the aligned query sequence.

     @rtype:  float
     @return: identity ratio; 0.0 for a zero-length object
     """
     # nothing to align in a zero-sized object
     if self.length == 0:
         return 0.0

     # ungapped DNA of query and sbjct
     query_dna, sbjct_dna = self.get_aligned_dna_sequences()
     query_dna, sbjct_dna = query_dna.replace("-", ""), sbjct_dna.replace("-", "")

     # (semi) unique fasta headers: barcode coordinates + random tag
     tag = get_random_string_tag()
     (q_sta, q_end, s_sta, s_end) = self.barcode()[0:4]
     query_key = "query%s%s%s" % (q_sta, q_end, tag)
     sbjct_key = "sbjct%s%s%s" % (s_sta, s_end, tag)

     # run clustalw on both sequences
     aligned, _matchline = clustalw(
         seqs={query_key: query_dna, sbjct_key: sbjct_dna})

     # count the columns in which query and sbjct are identical
     aligned_query = aligned[query_key]
     identical = sum(
         1 for pos, residue in enumerate(aligned_query)
         if residue == aligned[sbjct_key][pos])

     # relative ratio towards the aligned query length
     return float(identical) / len(aligned_query)
Example #5
0
def make_pacbps_for_edges(gra,aa_extra_offset=1,verbose=False):
    """
    Create a ClustalW based PacbPORF for each edge in the graph.

    For every node, the nt positions of the acceptor and donor of its
    node object are recalculated to aa positions, widened by
    `aa_extra_offset` on both sides and clamped to the Orf boundaries
    (protein_startPY / protein_endPY). For every pair of nodes that is
    connected by an edge, the two protein sequences are aligned with
    clustalw() and converted into a PacbPORF.

    @type  gra: graph object
    @param gra: graph whose edges must be provided with a PacbPORF; its edge
                weights and its `pacbps` dictionary are updated IN PLACE

    @type  aa_extra_offset: integer
    @param aa_extra_offset: number of AAs to extend each sequence with (both sides)

    @type  verbose: Boolean
    @param verbose: print a message to STDOUT for edges without a proper alignment

    @rtype:  None
    @return: None; `gra` is modified in place
    """
    warning_fmt = "Warning: ZeroProteinSequenceLengthException %s %s %s %s %s %s"

    coordsandseqs = {}
    # recalculate the nt positions (acceptor, donor) to aa positions
    for node in gra.get_ordered_nodes():
        exon = gra._node_object[node]
        accep_pos = exon.acceptor.pos
        donor_pos = exon.donor.pos
        aaStart = accep_pos // 3
        aaEnd = donor_pos // 3
        # Round the end coordinate up unless the nt length is a multiple of 3.
        # The original code read `donor.pos - accep.pos % 3 == 0`, which
        # binds as `donor.pos - (accep.pos % 3)` and (nearly) always rounded up.
        if (donor_pos - accep_pos) % 3 != 0:
            aaEnd += 1
        # get orf, sequence coordinates and sequence itself
        theorg = gra._organism_from_node(node)
        theorf = gra.get_orfs_of_graph(node=node)[0]
        aaStart -= aa_extra_offset
        aaEnd += aa_extra_offset
        # correct coordinates when falling outside of the Orf
        if aaEnd > theorf.protein_endPY:
            aaEnd = theorf.protein_endPY
        if aaStart < theorf.protein_startPY:
            aaStart = theorf.protein_startPY
        theseq = theorf.getaas(abs_pos_start=aaStart, abs_pos_end=aaEnd)
        # store to dict
        coordsandseqs[node] = (theseq, theorg, theorf, aaStart, aaEnd)

    for (node1, node2) in gra.pairwisecrosscombinations_node():
        # only node pairs that are present as an edge are processed
        if not gra.has_edge(node1, node2):
            continue

        (seq1, org1, orf1, aa1start, aa1end) = coordsandseqs[node1]
        (seq2, org2, orf2, aa2start, aa2end) = coordsandseqs[node2]

        # create the (fasta) headers used as keys in the clustalw input
        header1 = "%s_orf_%s_%s_%s" % (org1, orf1.id, aa1start, aa1end)
        header2 = "%s_orf_%s_%s_%s" % (org2, orf2.id, aa2start, aa2end)

        # both sequences must exist (at least 1 AA); warn for each empty one
        if not seq1:
            print(warning_fmt % ("S1", aa1start, aa1end, node1, node2, orf1))
        if not seq2:
            print(warning_fmt % ("S2", aa2start, aa2end, node1, node2, orf2))
        if not seq1 or not seq2:
            continue

        # align the sequences with clustalw
        seqs = {header1: seq1, header2: seq2}
        (alignedseqs, alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacb.conversion.pacbp_from_clustalw(
            alignment=(
                alignedseqs[header1],
                alignment,
                alignedseqs[header2],
            ),
            coords=(aa1start, aa1end, aa2start, aa2end),
        )

        if pacbp:
            # make & extend PacbPORF
            pacbporf = pacb.PacbPORF(pacbp, orf1, orf2)
            pacbporf.extend_pacbporf_after_stops()
            # update edge weight: the current weight is multiplied with
            # the identityscore of the new PacbPORF
            new_wt = pacbporf.identityscore * gra.get_edge_weight(node1, node2)
            gra.set_edge_weight(node1, node2, wt=new_wt)
            # add pacbporf to the graph
            key = pacbporf.construct_unique_key(node1, node2)
            gra.pacbps[(key, node1, node2)] = pacbporf
        elif verbose:
            # pacbp_from_clustalw did not yield any proper alignment
            print("NO PACBP!! %s %s %s %s" % (node1, node2, seq1, seq2))
Example #6
0
def update_PCG_with_signalpexons(signalpexonseqs,
                                 PCG,
                                 OPTIONS,
                                 min_pacbporf_identityscore=0.20,
                                 verbose=True):
    """ """
    if not signalpexonseqs.has_key(OPTIONS.target): return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[OPTIONS.target]:
        target = OPTIONS.target
        for informant, infSPlist in signalpexonseqs.iteritems():
            if informant == OPTIONS.target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            # get ordered pacbporfs fromt he PCG
            thepacbporfs = order_pacbporf_list(
                PCG.get_pacbps_by_organisms(OPTIONS.target, informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            for informantSPexon in infSPlist:
                coords = [
                    targetSPexon.protein_start(),
                    targetSPexon.protein_end(),
                    informantSPexon.protein_start(),
                    informantSPexon.protein_end(),
                ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                    targetSPexon.proteinsequence(),
                    informantSPexon.proteinsequence(),
                    targetSPexon.protein_start(),
                    informantSPexon.protein_start(),
                ))

                if False in [
                        pacbpCoordsObj.is_positioned_compatibly(pacbporf)
                        for pacbporf in thepacbporfs
                ]:
                    # *NOT* placable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH / 3:
                    # WAY TO FAR in front of current gene structure parts.
                    # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS
                    continue
                elif dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue
                else:
                    pass

                    # perform ClustalW alignment on the SP exons
                    (alignedseqs,alignment) =\
                clustalw( seqs= {
                    OPTIONS.target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                    alignment=(alignedseqs[OPTIONS.target], alignment,
                               alignedseqs[informant]),
                    coords=coords)

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identyscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp, targetSPexon.orf,
                                                     informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append(signalpexonPacbpORF)

                ################################################################
                if verbose:
                    print alignedseqs[OPTIONS.target], OPTIONS.target
                    print alignment
                    print alignedseqs[informant], informant
                    if pacbp:
                        print pacbp, (OPTIONS.target, targetSPexon.orf.id),
                        print(informant, informantSPexon.orf.id),
                        print "DISTANCE::", dist
                        pacbp.print_protein()
                        print ""
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                    signalpexon_pacbp_list, order_by='bits', reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,
                             OPTIONS.target,
                             informant,
                             PCG,
                             source='SignalP-ClustalW')
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print "SignalP Exon added to PCG:", signalp_pacbporf, informant
                ####################################################################
            else:
                pass

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
Example #7
0
def _create_hmm_profile(cbg,
                        area="OMSR",
                        prevcbg=None,
                        nextcbg=None,
                        strip_nonaligned_residues=False,
                        verbose=False,
                        **kwargs):
    """
    """
    # area must be one of
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del (coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in [
            "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
            "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"
    ]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in [
            "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF"
    ]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k, seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del (coords[k])
        del (fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.iteritems()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None
        for node, algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Example #8
0
def hmmhit2pacbp(queryorf,
                 queryorg,
                 querycoords,
                 sbjctorf,
                 sbjctorg,
                 hmmhit,
                 verbose=False):
    """
    Convert a single HMM search hit into a PacbPORF

    @type  queryorf: Orf object
    @param queryorf: Orf of the query; used for getaas(), protein_sequence,
                     protein_startPY, protein_endPY and its id

    @type  queryorg: Organism identifier
    @param queryorg: first element of the returned query Node

    @type  querycoords: list or tuple
    @param querycoords: only element [0] is used, as the offset that is
                        added to the query start coordinate of the hit

    @type  sbjctorf: Orf object
    @param sbjctorf: Orf of the sbjct; used for protein_startPY and its id

    @type  sbjctorg: Organism identifier
    @param sbjctorg: first element of the returned sbjct Node

    @type  hmmhit: tuple
    @param hmmhit: 10 elements: ( sbjct_header, sbjct_start, sbjct_end,
                   query_start, query_end, query, match, sbjct, score, expect )

    @type  verbose: Boolean
    @param verbose: print debugging/intermediate information to STDOUT

    @rtype:  tuple
    @return: ( ( pacbpkey, queryNode, sbjctNode ), pacbporf ) on success,
             ( False, None ) when no PacbPORF could be created
    """
    # trim hmmhit for unmatched characters
    (sbjct_header, sbjct_start, sbjct_end, query_start, query_end, query,
     match, sbjct, score, expect) = hmmhit

    # remove leading/trailing alignment columns for which the match string
    # has a space, and shift the coordinates accordingly.
    # NOTE(review): both coordinates are shifted for every removed column,
    # also when the removed query/sbjct character is a gap -- confirm that
    # hits never start or end with a gap column.
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start += 1
        query_start += 1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end -= 1
        query_end -= 1

    # get orf, node and AA and DNA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY
    sbjctNode = (sbjctorg, sbjctorf.id)
    # normalize the gap symbol ('.' -> '-') and the case of both sequences
    query = query.replace(".", "-").upper()
    sbjct = sbjct.replace(".", "-").upper()

    ############################################################################
    if verbose:
        print "hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
            sbjctorg, sbjctorf.id)
        print "hmmhit2pacbp Q '%s'" % query
        print "hmmhit2pacbp m '%s'" % match
        print "hmmhit2pacbp S '%s'" % sbjct
        print "hmmQ:", query, query_start, query_end, "gaps:",
        print query.count('-'), len(query)
        print "hmmM:", match
        print "hmmS:", sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
        print "len:", sbjct_aa_end - sbjct_aa_start, len(sbjct)
    ############################################################################

    # get Node and sequence of the query
    queryNode = (queryorg, queryorf.id)
    queryseq = deepcopy(query)

    # calculate query sequence position on queryorf;
    # gaps do not contribute to the length on the Orf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-')

    ############################################################################
    if verbose:
        print "hmmq:", queryseq, queryNode, query_aa_start, query_aa_end,
        print "len:", query_aa_end - query_aa_start, len(queryseq)
    ############################################################################

    # work on copies of the sbjct sequence and coordinates while trimming
    # alignment columns that start/end with a gap in query or sbjct.
    # (the original remark about a 'next iteration in the for loop' refers
    # to code that is not part of this function)
    sbjctseq = deepcopy(sbjct)
    sbjctaastart = deepcopy(sbjct_aa_start)
    sbjctaaend = deepcopy(sbjct_aa_end)
    # leading gap in query: the sbjct looses its first residue
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart += 1
    # leading gap in sbjct: the query looses its first residue
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start += 1
    # trailing gap in query: the sbjct looses its last residue
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend -= 1
    # trailing gap in sbjct: the query looses its last residue
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end -= 1

    # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM
    # profiles are build from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). Problem with HMM is, that in the result file
    # no information is written on where in the constructed HMM this hit
    # starts. This **sucks** because special care was taken in ABFGP code to
    # make sure the exact aa-coordinates of the applied sequences to ClustalW
    # are known. Hmmbuild here nullifies this effort by not giving start
    # coordinates. Therefore, we have to check the exact start position
    # of the HMM match on the queryorf.
    if queryseq.replace("-", "") != queryorf.getaas(query_aa_start,
                                                    query_aa_end):
        # the calculated coordinates do not yield the query sequence of the
        # hit; recover the position by aligning the hit to the complete Orf

        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-", "X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-", ""))

        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
        }

        # make coords dictionary for remapping
        coords = {
            'query_hmm': [0, seqlen],
            'query_orf': [queryorf.protein_startPY, queryorf.protein_endPY],
        }

        # perform clustalw multiple alignment
        (alignedseqs, alignment) = clustalw(seqs=seqdict)
        # strip exterior gaps
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

        if alignedseqs['query_hmm'].count("-") > 0:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjctseq_as_list = list(sbjctseq)
            for pos in range(0, len(alignedseqs['query_hmm'])):
                if alignedseqs['query_hmm'][pos] == "-":
                    sbjctseq_as_list.insert(pos, "-")
                # stop as soon as no gaps are left from this position onwards
                if alignedseqs['query_hmm'].find("-", pos) == -1:
                    break
            sbjctseq = "".join(sbjctseq_as_list)

        ########################################################################
        if verbose:
            print "\t", "FALSE::", sbjctseq, "[ WITH GAPS,SBJCT ]"
            print "\t", "FALSE::", queryseq, "[ WITH GAPS ]"
            for k, algseq in alignedseqs.iteritems():
                print "\t", "FALSE::", algseq, k, coords[k], len(algseq)
            print "\t", "FALSE::", sbjctseq, "SBJCT", len(sbjctseq)
            print "\t", "FALSE::", alignment, "ALMNT", len(alignment)
            print "\t", "SOLVED:", len(
                alignedseqs['query_orf']) == len(sbjctseq)
        ########################################################################

        # update query sequence & coordinates
        if len(alignedseqs['query_orf']) == len(sbjctseq):
            # recovery succeeded: take sequence and coordinates of the
            # aligned Orf as the query of the PacbP
            queryseq = alignedseqs['query_orf']
            query_aa_start = coords['query_orf'][0]
            query_aa_end = coords['query_orf'][1]
        else:
            # still not identical lengths. ClustalW recovery of HMM hit
            # failed miserably. For now: omit
            # TODO: resolve this case!!
            # example: --filewithloci examples/bilal/CFU_830450.bothss.csv
            # ## HMM clustalw input profile: False MAXSR True
            # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598]
            # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388]
            # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1)
            # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD'
            # hmmhit2pacbp m '+ ka +   F  W   k  + nLG Wl  E   d'
            # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID'
            # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34
            # hmmM: + ka +   F  W   k  + nLG Wl  E   d
            # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34
            # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ]
            #         FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ]
            #         FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70
            #         FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71
            #         FALSE:: **:***       *.: ::*::                 * .*           :.:*:  * *: : :: ALMNT 70
            #         SOLVED: False
            # Pacbp creation failed!
            return False, None

    if queryseq and sbjctseq:
        ################################################################
        if len(queryseq) != len(sbjctseq):
            # this will result in a exception to be raised:
            # pacb.exceptions.InproperlyAppliedArgument
            # print data here about what went wrong, then
            # just let the error be raised
            print queryseq, len(queryseq), sbjctseq, len(sbjctseq)
            print hmmhit
            print "Q:", query_aa_start, query_aa_end,
            print query_aa_end - query_aa_start, "len:", len(queryseq)
            print "S:", sbjctaastart, sbjctaaend,
            print sbjctaaend - sbjctaastart, "len:", len(sbjctseq)
        ################################################################
        # PacbP input: both aligned sequences and both start coordinates
        pacbpinput = (queryseq, sbjctseq, query_aa_start, sbjctaastart)
        pacbp = PacbP(input=pacbpinput)
        # remove consistent internal gaps caused by HMM profile search
        pacbp.strip_consistent_internal_gaps()
        pacbp.source = 'hmmsearch'
        pacbporf = PacbPORF(pacbp, queryorf, sbjctorf)
        pacbporf.strip_unmatched_ends()
        if pacbporf.length == 0:
            # Pacbp creation failed!
            return False, None
        else:
            pacbporf.extend_pacbporf_after_stops()
            pacbpkey = pacbporf.construct_unique_key(queryNode, sbjctNode)
            # return unique key and pacbporf
            return (pacbpkey, queryNode, sbjctNode), pacbporf
    else:
        # Pacbp creation failed!
        return False, None
Example #9
0
def improvealignment(
        cbg,
        verbose=False,
        allow_3p_optimization=True,
        allow_5p_optimization=True,
        maximal_cbg_identity=CBG_OPTIMIZE_MAXIMAL_IDENTITY,
        clustalw_gap_size=CBG_OPTIMIZE_CLUSTALW_GAP_SIZE,
        optimization_bitscore_ratio=CBG_OPTIMIZE_MINIMAL_BITSCORE_RATIO,
        optimization_identity_ratio=CBG_OPTIMIZE_MINIMAL_IDENTITY_RATIO):
    """
    (Try to) Improve the multiple alignment of this CBG with clustalw

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize 

    @type  verbose: Boolean 
    @param verbose: print debugging/intermediate information to STDOUT 

    @type  allow_3p_optimization: Boolean  
    @param allow_3p_optimization: allow optimization(extension!) on the 3p side

    @type  allow_5p_optimization: Boolean
    @param allow_5p_optimization: allow optimization(extension!) on the 5p side

    @type  maximal_cbg_identity: float
    @param maximal_cbg_identity: do not optimize CBG when its GTG.identity() > this number

    @type  clustalw_gap_size: integer
    @param clustalw_gap_size: split ClustalW-multiplealignment obtained PacbPs on gap size

    @type  optimization_bitscore_ratio: float
    @param optimization_bitscore_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP

    @type  optimization_identity_ratio: float
    @param optimization_identity_ratio: only allow longer ClustalW-PacbPs when at least this ratio towards the original PacbP

    @attention: when a CBG is flanked by a lsrCBG in the GSG, it advised to set allow_*p_optimization to False

    @rtype:  Boolean 
    @return: is the CBG optimized or not
    """
    IS_IMPROVED = False

    # if both allow_*p_optimization are False -> no optimization!
    if not allow_3p_optimization and not allow_5p_optimization:
        return False

    # check if there is a likely chance that we can optimize this cbg
    # This chance is defined by parameter
    if cbg.get_genetree().identity() > maximal_cbg_identity:
        return False

    # gather data of the current cbg to compare before/after clustalw optimization
    current_cbg_total_weight = cbg.total_weight()
    current_cbg_string = str(cbg)
    current_cbg_omsr = cbg.overall_minimal_spanning_range()
    current_cbg_maxsr = cbg.maximal_spanning_range()

    # get the orf's sequences in a dict and do clustalw
    seqs = cbg.getorfproteinsequences()
    (_algseqs, _algm) = clustalw(seqs=seqs)

    # check if there is at least a single aligned position
    if len(_algm) == _algm.count(' '): return False
    # get the position of the first and last aligned AA in the clustalw alignment
    firstalignedpos = 0
    finalalignedpos = len(_algm) - 1
    while _algm[firstalignedpos] == ' ':
        firstalignedpos += 1
    while _algm[finalalignedpos] == ' ':
        finalalignedpos -= 1
    # increase finalalignedpos+=1 for compatibility asa list slice
    finalalignedpos += 1

    # translate clustalw multiple alignment start & end to OMSR coordinates
    # While doing this, check if the current OMSR is fully covered by the
    # ClustalW OMSR. In case of long orf sequences and small CBGs,
    # ClustalW is likely to produce out-of-range alignments!
    newomsr = {}
    OMSR_IS_COMPLETELY_COVERED = True
    for org in seqs.keys():
        orf = cbg.get_orfs_of_graph(organism=org)[0]
        node = cbg.node_by_organism(org)
        omsrstart = orf.protein_startPY + (
            firstalignedpos - _algseqs[org][0:firstalignedpos].count('-'))
        omsrend = omsrstart + (
            finalalignedpos - firstalignedpos -
            _algseqs[org][firstalignedpos:finalalignedpos].count('-'))
        newomsr[org] = (omsrstart, omsrend)
        omsrunion = current_cbg_omsr[node].intersection(
            Set(range(omsrstart, omsrend + 1)))
        if len(omsrunion) < len(current_cbg_omsr[node]):
            OMSR_IS_COMPLETELY_COVERED = False
            if verbose:
                print org, len(omsrunion), " < ", len(current_cbg_omsr[node])
            continue

        if verbose:
            print org, min(current_cbg_omsr[node]), max(
                current_cbg_omsr[node]), "new:", (omsrstart, omsrend),
            print "maxsr:", min(current_cbg_maxsr[node]), max(
                current_cbg_maxsr[node]),
            print node, orf, orf.protein_startPY, orf.protein_endPY,
            print len(_algseqs[org]), len(
                _algseqs[org]) - _algseqs[org].count('-'), orf.length / 3

    # Check if current CBG OMSR is overlapping with clustalw OMSR
    if not OMSR_IS_COMPLETELY_COVERED:
        if verbose: print "NO improvement, ClustalW out-of-range-alignment"
        return False

    #######################################################################
    if verbose:
        linesize = 100
        print "<ClustalW obtained multiple alignment>"
        for offset in range(0, len(_algm), linesize):
            start = firstalignedpos + offset
            end = start + linesize
            if end > finalalignedpos: end = finalalignedpos
            if offset == 0 and finalalignedpos - firstalignedpos < linesize:
                end = finalalignedpos
            for org in seqs.keys():
                print _algseqs[org][start:end], org
            print _algm[start:end]
            print ""
            if end == finalalignedpos: break
        print current_cbg_string
        cbg.printmultiplealignment()
    #######################################################################

    # loop over the pairwise organism combinations and make new pacbps
    # but only if the new OMSR extends the known OMSR.
    # In this process, split the ClustalW PacbpOrfs for gaps
    # of size clustalw_gap_size
    for orgA, orgB in cbg.pairwisecrosscombinations_organism():
        # get the current/original pacbporf
        pacbporf = cbg.get_pacbp_by_organisms(orgA, orgB)
        # are the new multiplealignment OMSR coords bigger as the current ones?
        spos = pacbporf._get_original_alignment_pos_start()
        epos = pacbporf._get_original_alignment_pos_end()
        isextended5p = (newomsr[orgA][0] < spos.query_pos,
                        newomsr[orgB][0] < spos.sbjct_pos)
        isextended3p = (newomsr[orgA][1] - 1 > epos.query_pos,
                        newomsr[orgB][1] - 1 > epos.sbjct_pos)

        # check if there is novel extention and on which side
        extention = None
        if isextended5p == (True, True) and isextended3p == (True, True):
            extention = 'both'  # extention on both sides
        elif isextended5p == (True, True):
            extention = '5p'  # extention on 5p side alone
        elif isextended3p == (True, True):
            extention = '3p'  # extention on 3p side alone
        else:
            # no extention at all -> continue
            continue

        # Check if extention is alowed in this side
        # This check is recommended to be included for CBGs
        # that are neigbored/delimited/separated by lsrCBG(s)
        if not allow_3p_optimization and extention in ['both', '3p']:
            continue  # not alowed!
        if not allow_5p_optimization and extention in ['both', '5p']:
            continue  # not alowed!

        # get orf objects and aligned sequence parts
        orfA = cbg.get_orfs_of_graph(organism=orgA)[0]
        orfB = cbg.get_orfs_of_graph(organism=orgB)[0]
        seqA = _algseqs[orgA][firstalignedpos:finalalignedpos]
        seqB = _algseqs[orgB][firstalignedpos:finalalignedpos]
        nodeQ = cbg.node_by_organism(orgA)
        nodeS = cbg.node_by_organism(orgB)
        # make pacbp from this clustalw alignment and extend it
        alignment = (seqA, _algm[firstalignedpos:finalalignedpos], seqB)
        alignment = _remove_gaps_from_clustalw_alignment(alignment)
        coords = (newomsr[orgA][0], newomsr[orgA][1], newomsr[orgB][0],
                  newomsr[orgB][1])
        newpacbp = pacb.conversion.pacbp_from_clustalw(alignment=alignment,
                                                       coords=coords)

        # check for gaps in the clustalw alignment; if so, split them and select the
        # pacbp that overlaps with the omsr
        if newpacbp.alignment_has_gaps(gap_size=clustalw_gap_size):
            splitted, status = pacb.splitting.split_pacb_on_gaps(
                newpacbp, gapsize=clustalw_gap_size)
            if not status:
                # pacbp cannot be splitted for some reason.
                # Ignore it and continue with the next orgA/orgB comparison
                continue
            split_is_compatible = False
            for splittedpacbp in splitted:
                if splittedpacbp.query_start <= min(current_cbg_omsr[nodeQ]) and\
                splittedpacbp.query_end >= max(current_cbg_omsr[nodeQ]) and\
                splittedpacbp.sbjct_start <= min(current_cbg_omsr[nodeS]) and\
                splittedpacbp.sbjct_end >= max(current_cbg_omsr[nodeS]):
                    newpacbp = splittedpacbp
                    split_is_compatible = True
                    # check - again - if this clustalw-obtained pacbp is an extention
                    newomsr[orgA] = (newpacbp.query_start, newpacbp.query_end)
                    newomsr[orgB] = (newpacbp.sbjct_start, newpacbp.sbjct_end)
                    isextended5p = (newomsr[orgA][0] < spos.query_pos,
                                    newomsr[orgB][0] < spos.sbjct_pos)
                    isextended3p = (newomsr[orgA][1] - 1 > epos.query_pos,
                                    newomsr[orgB][1] - 1 > epos.sbjct_pos)

                    # check if there is novel extention and on which side
                    extention = None
                    if isextended5p == (True, True) and isextended3p == (True,
                                                                         True):
                        extention = 'both'  # extention on both sides
                    elif isextended5p == (True, True):
                        extention = '5p'  # extention on 5p side alone
                    elif isextended3p == (True, True):
                        extention = '3p'  # extention on 3p side alone
                    else:
                        split_is_compatible = False
                    # break out of looping over the splits
                    break

            # check if the split was compatible with the OMSR of the current CBG
            if not split_is_compatible:
                # pacbp splits rigth through the OMSR region we are interested in.
                # Ignore it and continue with the next orgA/orgB comparison
                continue

        # convert (splitted) pacbp into pacbporf
        newpacbporf = pacb.conversion.pacbp2pacbporf(newpacbp, orfA, orfB)

        # now merge the clustalw pacbporf with the existing blast pacbporf
        status3p, status5p = False, False
        if extention in ['3p', 'both']:
            merged, status3p = pacb.merging.merge_pacbporfs(pacbporf,
                                                            newpacbporf,
                                                            'rigth',
                                                            verbose=verbose)
        if extention in ['5p', 'both']:
            if extention == 'both':
                # do not merge `pacbporf` but `merged` -> it is changed 4 lines higher up!
                merged, status5p = pacb.merging.merge_pacbporfs(
                    merged, newpacbporf, 'left', verbose=verbose)
            else:
                merged, status5p = pacb.merging.merge_pacbporfs(
                    pacbporf, newpacbporf, 'left', verbose=verbose)

        if float(pacbporf.bitscore) == 0.0:
            print "ZeroDivisionError in creation!"

        # Only reset the old (pacbporf) by the new (merged) if:
        #  True in (status3p, status5p) AND
        #  orf.bitscore ratio >= optimization_bitscore_ratio AND
        #  orf.identityscore  >= optimization_identity_ratio

        # Be aware of a potential ZeroDivisionError in the bitscore ratio
        try:
            bitscore_ratio_check = (float(merged.bitscore) / float(
                pacbporf.bitscore)) >= optimization_bitscore_ratio
        except ZeroDivisionError:
            # do not take ratio, just check if bigger.
            # by default, optimization_bitscore_ratio < 1.0, so
            # checking for gte is even a more stringent check
            bitscore_ratio_check = merged.bitscore >= pacbporf.bitscore

        # ZeroDivisionError in the identityscore can not/hardly be possible.
        # identityscore == 0 means nothing that is alignable at all!
        # But, just be certain becasue bitscore ratio ZeroDivisionError occurred as well
        try:
            identity_ratio_check = (
                merged.identityscore /
                pacbporf.identityscore) >= optimization_identity_ratio
        except ZeroDivisionError:
            # do not take ratio, just check if bigger.
            # by default, optimization_identity_ratio < 1.0, so
            # checking for gte is even a more stringent check
            identity_ratio_check = merged.identityscore >= pacbporf.identityscore

        if True in (status3p, status5p
                    ) and bitscore_ratio_check and identity_ratio_check:
            # reset 'old' pacbporf by 'merged'
            nodeQ = cbg.node_by_organism(orgA)
            nodeS = cbg.node_by_organism(orgB)
            cbg.remove_pacbp(pacbporf, nodeQ, nodeS)
            # and reset the pacbporf into the cbg
            merged.extend_pacbporf_after_stops()
            merged.source = "clustalw-OPTIMIZED"
            newkey = merged.construct_unique_key(nodeQ, nodeS)
            cbg.pacbps[(newkey, nodeQ, nodeS)] = merged
            IS_IMPROVED = True
            if verbose: print "IMPROVEMENT", orgA, orgB
            ###merged.print_protein(_linesize=150)
        else:
            if verbose: print "DISCARDED", orgA, orgB
            continue

    if IS_IMPROVED:
        cbg.clear_cache()
        cbg.update_edge_weights_by_minimal_spanning_range()
        cbg.create_cache()
        if verbose:
            print "### OPTIMIZED CBG", cbg
            cbg.printmultiplealignment()
        # return status True -> this CBG is optimized!
        return True
    else:
        if verbose: print "### no CBG optimization"
        # return status False -> no CBG optimized!
        return False
Example #10
0
def improvealignment(cbg,verbose=False,
    allow_3p_optimization=True,
    allow_5p_optimization=True,
    maximal_cbg_identity=CBG_OPTIMIZE_MAXIMAL_IDENTITY,
    clustalw_gap_size=CBG_OPTIMIZE_CLUSTALW_GAP_SIZE,
    optimization_bitscore_ratio=CBG_OPTIMIZE_MINIMAL_BITSCORE_RATIO,
    optimization_identity_ratio=CBG_OPTIMIZE_MINIMAL_IDENTITY_RATIO):
    """
    (Try to) Improve the multiple alignment of this CBG with clustalw

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph instance to optimize 

    @type  verbose: Boolean 
    @param verbose: print debugging/intermediate information to STDOUT 

    @type  allow_3p_optimization: Boolean  
    @param allow_3p_optimization: allow optimization(extension!) on the 3p side

    @type  allow_5p_optimization: Boolean
    @param allow_5p_optimization: allow optimization(extension!) on the 5p side

    @type  clustalw_gap_size: integer
    @param clustalw_gap_size: split ClustalW-multiplealignment obtained
                              PacbPs on this AA gap size

    @type  maximal_cbg_identity: float
    @param maximal_cbg_identity: do not optimize CBG when its
                                 GTG.identity() > this number

    @type  optimization_bitscore_ratio: float
    @param optimization_bitscore_ratio: only allow longer ClustalW-PacbPs when
                        at least this ratio towards the original PacbP

    @type  optimization_identity_ratio: float
    @param optimization_identity_ratio: only allow longer ClustalW-PacbPs when
                        at least this ratio towards the original PacbP

    @attention: when a CBG is flanked by a lsrCBG in the GSG, it advised to set
                        allow_*p_optimization to False

    @rtype:  Boolean 
    @return: is the CBG optimized or not
    """
    IS_IMPROVED = False

    # if both allow_*p_optimization are False -> no optimization!
    if not allow_3p_optimization and not allow_5p_optimization:
        return False

    # check if there is a likely chance that we can optimize this cbg
    # This chance is defined by parameter
    if cbg.get_genetree().identity() > maximal_cbg_identity:
        return False

    # gather current CBG data to compare before/after ClustalW optimization
    current_cbg_total_weight = cbg.total_weight()
    current_cbg_string       = str(cbg)
    current_cbg_omsr         = cbg.overall_minimal_spanning_range()
    current_cbg_maxsr        = cbg.maximal_spanning_range()

    # get the orf's sequences in a dict and do clustalw
    seqs = cbg.getorfproteinsequences()
    (_algseqs,_algm) = clustalw(seqs=seqs)

    # check if there is at least a single aligned position
    if len(_algm) == _algm.count(' '):
        return False
    # get position of the first and last aligned AA in the clustalw alignment
    firstalignedpos = 0
    finalalignedpos  = len(_algm)-1
    while _algm[firstalignedpos] == ' ': firstalignedpos+=1
    while _algm[finalalignedpos] == ' ': finalalignedpos-=1
    # increase finalalignedpos+=1 for compatibility asa list slice
    finalalignedpos+=1

    # translate ClustalW multiple alignment start & end to OMSR coordinates
    # While doing this, check if the current OMSR is fully covered by the
    # ClustalW OMSR. In case of long orf sequences and small CBGs,
    # ClustalW is likely to produce out-of-range alignments!
    newomsr = {}
    OMSR_IS_COMPLETELY_COVERED = True
    for org in seqs.keys():
        orf  = cbg.get_orfs_of_graph(organism=org)[0]
        node = cbg.node_by_organism(org)
        omsrstart = orf.protein_startPY +\
            ( firstalignedpos - _algseqs[org][0:firstalignedpos].count('-') )
        omsrend   = omsrstart + ( finalalignedpos - firstalignedpos -\
            _algseqs[org][firstalignedpos:finalalignedpos].count('-') )
        newomsr[org] = (omsrstart,omsrend)
        # get the union between cirrent CBGs OMSR and this novel OMSR
        omsrunion = current_cbg_omsr[node].intersection(
            Set(range( omsrstart, omsrend+1 ) ) )
        if len(omsrunion) < len(current_cbg_omsr[node]):
            # no, OMSR shrunk in stead of increased
            OMSR_IS_COMPLETELY_COVERED = False 
            ####################################################################
            if verbose:
                print org, len(omsrunion), " < ", len(current_cbg_omsr[node])
            ####################################################################
            # continue here; for this Organism identifier no improvement
            continue

        ########################################################################
        if verbose:
            print org, min(current_cbg_omsr[node]), max(current_cbg_omsr[node]),
            print "new:", (omsrstart,omsrend),
            print "maxsr:", min(current_cbg_maxsr[node]),
            print max(current_cbg_maxsr[node]), node, orf ,
            print orf.protein_startPY, orf.protein_endPY, len(_algseqs[org]),
            print len(_algseqs[org])-_algseqs[org].count('-'), orf.length/3
        ########################################################################

    # Check if current CBG OMSR is overlapping with clustalw OMSR
    if not OMSR_IS_COMPLETELY_COVERED:
        ########################################################################
        if verbose: print "NO improvement, ClustalW out-of-range-alignment"
        ########################################################################
        return False 


    ############################################################################
    if verbose:
        linesize=100
        print "<ClustalW obtained multiple alignment>"
        for offset in range(0,len(_algm),linesize):
            start = firstalignedpos + offset
            end   = start + linesize
            if end > finalalignedpos: end = finalalignedpos
            if offset==0 and finalalignedpos-firstalignedpos < linesize:
                end = finalalignedpos 
            for org in seqs.keys():
                print _algseqs[org][start:end], org
            print _algm[start:end]
            print ""
            if end == finalalignedpos: break
        print current_cbg_string
        cbg.printmultiplealignment()
    ############################################################################


    # loop over the pairwise organism combinations and make new pacbps
    # but only if the new OMSR extends the known OMSR.
    # In this process, split the ClustalW PacbpOrfs for gaps
    # of size clustalw_gap_size
    for orgA,orgB in cbg.pairwisecrosscombinations_organism():
        # get the current/original pacbporf
        pacbporf = cbg.get_pacbp_by_organisms(orgA,orgB)
        # check if there is novel extention and on which side
        extention = _does_clustalw_omsr_extend_pacbporf_omsr(
                        pacbporf,orgA,orgB,newomsr)

        # Check if extention is alowed in this side. This check is recommended
        # to be included for CBGs # that are neigbored/delimited by lsrCBG(s)
        if not allow_3p_optimization and extention in ['both','3p']:
            continue    # not alowed!
        if not allow_5p_optimization and extention in ['both','5p']:
            continue    # not alowed!
        if extention == None:
            continue    # not alowed!

        # get orf objects and aligned sequence parts
        orfA  = cbg.get_orfs_of_graph(organism=orgA)[0]
        orfB  = cbg.get_orfs_of_graph(organism=orgB)[0]
        seqA  = _algseqs[orgA][firstalignedpos:finalalignedpos]
        seqB  = _algseqs[orgB][firstalignedpos:finalalignedpos]
        nodeQ = cbg.node_by_organism(orgA)
        nodeS = cbg.node_by_organism(orgB)
        # make pacbp from this clustalw alignment and extend it
        alignment = ( seqA, _algm[firstalignedpos:finalalignedpos], seqB )
        coords = ( newomsr[orgA][0], newomsr[orgA][1],
                   newomsr[orgB][0], newomsr[orgB][1] )
        newpacbp = pacb.conversion.pacbp_from_clustalw(
                        alignment=alignment,coords=coords)

        # check for gaps in the clustalw alignment; if so, split them and
        # select the PacbP that overlaps with the OMSR
        if newpacbp.alignment_has_gaps(gap_size=clustalw_gap_size):
            splitted,status = pacb.splitting.split_pacb_on_gaps(
                    newpacbp,gapsize=clustalw_gap_size)
            if not status:
                # pacbp cannot be splitted for some reason.
                # Ignore it and continue with the next orgA/orgB comparison
                continue
            split_is_compatible = False
            for splittedpacbp in splitted:
                if splittedpacbp.query_start <= min(current_cbg_omsr[nodeQ]) and\
                splittedpacbp.query_end >= max(current_cbg_omsr[nodeQ]) and\
                splittedpacbp.sbjct_start <= min(current_cbg_omsr[nodeS]) and\
                splittedpacbp.sbjct_end >= max(current_cbg_omsr[nodeS]):
                    # this is the splitted PacbP that overlaps with current OMSR
                    newpacbp = splittedpacbp
                    split_is_compatible = True

                    # update the clustalwOMSR coords (newomsr)
                    newomsr[orgA] = (newpacbp.query_start,newpacbp.query_end)
                    newomsr[orgB] = (newpacbp.sbjct_start,newpacbp.sbjct_end)

                    # check - again - if this clustalw OMSR is an extention
                    # check with the ORIGINAL pacbporf!
                    extention = _does_clustalw_omsr_extend_pacbporf_omsr(
                                    pacbporf,orgA,orgB,newomsr)
                    
                    # if no extention, set split as incompatible
                    if extention == None: split_is_compatible = False

                    # break out of looping over the splits
                    break

            # check if the split was compatible with the OMSR of the current CBG
            if not split_is_compatible:
                # pacbp splits rigth through the relevant OMSR region.
                # Ignore it and continue with the next orgA/orgB comparison
                continue

        # If here, convert the (splitted) pacbp into pacbporf
        newpacbporf = pacb.conversion.pacbp2pacbporf(newpacbp,orfA,orfB)

        # now merge the clustalw pacbporf with the existing blast pacbporf
        status3p, status5p = False,False
        if extention in ['3p','both']:
            merged, status3p = pacb.merging.merge_pacbporfs(
                            pacbporf,newpacbporf,'rigth',verbose=verbose)
        if extention in ['5p','both']:
            if extention == 'both':
                # take `merged` as input pacbporf, not `pacbporf` ->
                # it is changed 4 lines higher up!
                merged, status5p = pacb.merging.merge_pacbporfs(
                            merged,newpacbporf,'left',verbose=verbose)
            else:
                merged, status5p = pacb.merging.merge_pacbporfs(
                            pacbporf,newpacbporf,'left',verbose=verbose)
           
        # Only reset the old (pacbporf) by the new (merged) if:
        #  True in (status3p, status5p) AND 
        #  orf.bitscore ratio >= optimization_bitscore_ratio AND
        #  orf.identityscore  >= optimization_identity_ratio

        try:
            # Be aware of a potential ZeroDivisionError in the bitscore ratio
            bitscore_ratio_check = ( float(merged.bitscore) /\
                    float(pacbporf.bitscore) ) >= optimization_bitscore_ratio 
        except ZeroDivisionError:
            # do not take ratio, just check if bigger.
            # by default, optimization_bitscore_ratio < 1.0, so
            # checking for gte is even a more stringent check
            bitscore_ratio_check = merged.bitscore >= pacbporf.bitscore

        try:
            # ZeroDivisionError in the identityscore can not/hardly be possible.
            # Identityscore == 0 means nothing that is alignable at all!
            # But, safety first ...
            identity_ratio_check = ( merged.identityscore /\
                    pacbporf.identityscore ) >= optimization_identity_ratio
        except ZeroDivisionError:
            # do not take ratio, just check if bigger.
            # by default, optimization_identity_ratio < 1.0, so
            # checking for gte is even a more stringent check
            identity_ratio_check = merged.identityscore >=pacbporf.identityscore 


        if True in (status3p, status5p) and\
        bitscore_ratio_check and identity_ratio_check:
            # reset 'old' pacbporf by 'merged'
            nodeQ = cbg.node_by_organism(orgA)
            nodeS = cbg.node_by_organism(orgB)
            cbg.remove_pacbp(pacbporf,nodeQ,nodeS)
            # and reset the pacbporf into the cbg
            merged.extend_pacbporf_after_stops()
            merged.source="clustalw-OPTIMIZED"
            newkey = merged.construct_unique_key(nodeQ,nodeS)
            cbg.pacbps[(newkey,nodeQ,nodeS)] = merged
            IS_IMPROVED = True
            ####################################################################
            if verbose: print "IMPROVEMENT", orgA, orgB
            ####################################################################
        else:
            ####################################################################
            if verbose: print "DISCARDED", orgA, orgB
            ####################################################################
            continue

    if IS_IMPROVED:
        # CBG is succesfully changed. Recreate cache etc.
        cbg.clear_cache()
        cbg.update_edge_weights_by_minimal_spanning_range()
        cbg.create_cache()
        ############################################################
        if verbose:
            print "### OPTIMIZED CBG", cbg
            cbg.printmultiplealignment()
        ############################################################
        # return status True -> this CBG is optimized!
        return True
    else:
        ############################################################
        if verbose: print "### no CBG optimization"
        ############################################################
        # return status False -> no CBG optimized!
        return False
Example #11
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,pacbporfA,
    verbose=False,**kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs,KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs['cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,pacbporfA,verbose=verbose,**kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs['cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ,intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)
        ########################################################################
        if verbose:
            print (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt-distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append( ( intQ, intS ) )

    ############################################################################
    if verbose:
        for intQ,intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print (intS.donor.pos, intS.acceptor.pos)
    ############################################################################


    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ,intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos*3 + dQphase) - (dSpos*3 + dSphase)
        distAnt = (aQpos*3 + aQphase) - (aSpos*3 + aSphase)

        if distDnt > 0:   # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode  = "SQ"
            qEnd  = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart= qEnd - max([distA,distD])
            sStart= pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd  = sStart + max([distA,distD])
            qSeq  = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)

        else: # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode  = "QS"
            qStart= pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd  = qStart - min([distA,distD])
            sEnd  = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart= sEnd + min([distA,distD])
            qSeq  = pacbporfA.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            sSeq  = pacbporfD.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)


        headerQ = "query_%s_%s_%s" % (qStart,qEnd,qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart,sEnd,sSeq)
        headerQ = headerQ[0:20] # truncate to prevent error
        headerS = headerS[0:20] # truncate to prevent error
        if verbose:
            print mode, (distD,distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue # superfluous check-doublecheck for sequence
        if not sSeq: continue # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = { headerQ: qSeq, headerS: sSeq }
        (alignedseqs,alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(
                    alignment=(
                            alignedseqs[headerQ],
                            alignment,
                            alignedseqs[headerS]
                            ),
                    coords=(qStart,qEnd,sStart,sEnd)
                    )

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfD.orfQ,pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp,pacbporfA.orfQ,pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################


            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt-distAnt)
            intS._distance = abs(distDnt-distAnt)
    
            if distDnt > 0:   # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,cig_pacbporf,pacbporfA)
                succes = set_apps_intron_sbjct(intS,pacbporfD,cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ,pacbporfD,cig_pacbporf)
                succes = set_apps_intron_sbjct(intS,cig_pacbporf,pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [ cig_pacbporf ]
            intS._linked_to_pacbporfs = [ cig_pacbporf ]


            # append to found_cig_list
            found_cig_list.append( ( intQ, intS, cig_pacbporf ) )

        else:
            # no alignment possible -> try next
            continue
    
    # return lists of closeby_independant_introns
    return found_cig_list
Example #12
0
def update_PCG_with_signalpexons(signalpexonseqs,PCG,OPTIONS,
    min_pacbporf_identityscore=0.20,verbose=True):
    """ """
    if not signalpexonseqs.has_key(OPTIONS.target): return False
    is_any_pacbporf_added = False
    for targetSPexon in signalpexonseqs[OPTIONS.target]:
        target = OPTIONS.target
        for informant,infSPlist in signalpexonseqs.iteritems():
            if informant == OPTIONS.target: continue
            # check if informant has been deleted in the meanwhile
            if informant not in PCG.organism_set(): continue
            # list to store signalp exons into
            signalpexon_pacbp_list = []
            # get ordered pacbporfs fromt he PCG
            thepacbporfs = order_pacbporf_list(PCG.get_pacbps_by_organisms(OPTIONS.target,informant))
            if not thepacbporfs:
                # no alignments present for this organism (can happen!)
                continue
            for informantSPexon in infSPlist:
                coords  = [ targetSPexon.protein_start(),
                            targetSPexon.protein_end(),
                            informantSPexon.protein_start(),
                            informantSPexon.protein_end(), ]

                # prior to making ClustalW-PacbP, check PacbPCOORD placeability
                # into the list of pacbporfs
                pacbpCoordsObj = PacbPCOORDS(input=(
                        targetSPexon.proteinsequence(),
                        informantSPexon.proteinsequence(),
                        targetSPexon.protein_start(),
                        informantSPexon.protein_start(),
                        ) )

                if False in [ pacbpCoordsObj.is_positioned_compatibly(pacbporf) for pacbporf in thepacbporfs ]:
                    # *NOT* placable in current ordered list of PacbPORFS
                    continue

                dist = pacbpCoordsObj.distance_towards(thepacbporfs[0])
                if dist > SIGNALP_FIRSTEXON_MAX_INTRON_NT_LENGTH/3:
                    # WAY TO FAR in front of current gene structure parts.
                    # Do not allow (pooras a *NOT* placable in current ordered list of PacbPORFS
                    continue
                elif dist == 0:
                    # NOT placeable in front of the rest of the PacbPORFS.
                    continue
                else:
                    pass

                # perform ClustalW alignment on the SP exons
                    (alignedseqs,alignment) =\
                clustalw( seqs= { 
                    OPTIONS.target: targetSPexon.proteinsequence(),
                    informant: informantSPexon.proteinsequence() } )

                # make pacbp from clustalw alignment
                pacbp = pacbp_from_clustalw(
                            alignment=(
                                    alignedseqs[OPTIONS.target],
                                    alignment,
                                    alignedseqs[informant]
                                    ),
                            coords=coords
                            )

                # is there any alignment constructed?
                if not pacbp: continue

                # ignore (very) poor identyscore alignments
                if pacbp.identityscore < min_pacbporf_identityscore: continue

                # if here make extended pacbpORF
                signalpexonPacbpORF = pacbp2pacbporf(pacbp,
                        targetSPexon.orf,informantSPexon.orf)
                signalpexonPacbpORF.extend_pacbporf_after_stops()
                # and store in signalpexon_pacbp_list
                signalpexon_pacbp_list.append( signalpexonPacbpORF )

                ################################################################
                if verbose:
                    print alignedseqs[OPTIONS.target], OPTIONS.target
                    print alignment
                    print alignedseqs[informant], informant
                    if pacbp:
                        print pacbp, (OPTIONS.target, targetSPexon.orf.id),
                        print (informant, informantSPexon.orf.id),
                        print "DISTANCE::", dist
                        pacbp.print_protein()
                        print ""
                ################################################################

            # If there are signalpexon-guided pacbporfs found, store the one
            # with the highest bitscore
            if signalpexon_pacbp_list:
                signalpexon_pacbp_list = order_list_by_attribute(
                        signalpexon_pacbp_list,order_by='bits',reversed=True)
                # store best bitscoring pacbporf to PCG
                signalp_pacbporf = signalpexon_pacbp_list[0]
                pacbporf2PCG(signalp_pacbporf,OPTIONS.target,informant,PCG,source='SignalP-ClustalW') 
                is_any_pacbporf_added = True
                ####################################################################
                if verbose:
                    print "SignalP Exon added to PCG:", signalp_pacbporf, informant
                ####################################################################
            else:
                pass

    # return pointer is_any_pacbporf_added
    return is_any_pacbporf_added
Example #13
0
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None,
    strip_nonaligned_residues=False,
    verbose=False,**kwargs):
    """
    """
    # area must be one of 
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) )
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) )
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node])+1,theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del(coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( prevcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] )
            end = max(coords[nodeCbg])+1
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( nextcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] )
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF"]:
        maxlength = max([ len(vlist) for vlist in coords.values() ])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and 
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k,seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del(coords[k])
        del(fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ])

    # perform clustalw multiple alignment
    (alignedseqs,alignment) = clustalw( seqs= fastaseqs )


    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR","MINSR"]:
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )


    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs,alignment,coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 )


    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs,alignment,coords = strip_overall_nonaligned_residues(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}


    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None
        for node,algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs,fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile )

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Example #14
0
def hmmhit2pacbp(queryorf,queryorg,querycoords,sbjctorf,sbjctorg,hmmhit,verbose=False):
    """
    """
    # trim hmmhit for unmatched characters
    ( sbjct_header, sbjct_start, sbjct_end,
      query_start, query_end,
      query, match, sbjct, score, expect ) = hmmhit

    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start+=1
        query_start+=1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end-=1
        query_end-=1

    # get orf, node and AA and DNA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start  = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end    = sbjct_end + sbjctorf.protein_startPY
    sbjctNode       = (sbjctorg,sbjctorf.id)
    query           = query.replace(".","-").upper()
    sbjct           = sbjct.replace(".","-").upper()

    ############################################################################
    if verbose:
        print "hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
                sbjctorg,sbjctorf.id)
        print "hmmhit2pacbp Q '%s'" % query
        print "hmmhit2pacbp m '%s'" % match
        print "hmmhit2pacbp S '%s'" % sbjct
        print "hmmQ:", query, query_start, query_end, "gaps:",
        print query.count('-'), len(query)
        print "hmmM:", match
        print "hmmS:", sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
        print "len:", sbjct_aa_end-sbjct_aa_start , len(sbjct)
    ############################################################################

    # get Node and sequence of the query
    queryNode = (queryorg,queryorf.id)
    queryseq  = deepcopy(query)

    # calculate query sequence position on queryorf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end   = query_aa_start + len(queryseq) - queryseq.count('-')

    ############################################################################
    if verbose:
        print "hmmq:", queryseq, queryNode, query_aa_start, query_aa_end,
        print "len:", query_aa_end-query_aa_start, len(queryseq)
    ############################################################################

    # make a deepcopy; sbjct is needed unchanged for the next iteration
    # in the for loop, but here we want to trim of gap sequences
    sbjctseq = deepcopy(sbjct)
    sbjctaastart = deepcopy(sbjct_aa_start)
    sbjctaaend   = deepcopy(sbjct_aa_end)
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart+=1
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start+=1
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend-=1
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end-=1

    # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM
    # profiles are build from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). Problem with HMM is, that in the result file
    # no information is written on where in teh constructed HMM this hit
    # starts. This **sucks** because special care was taken in ABFGP code to
    # make shure the exact aa-coordinates of the applied sequences to ClustalW
    # are known. Hmmbuild here nullifies this effort by not giving start
    # coordinates. Therefore, we have to check the exact start position
    # of the HMM match on the queryorf.
    if queryseq.replace("-","") != queryorf.getaas(query_aa_start,query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-","X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-",""))

        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
            }

        # make coords dictionary for remapping
        coords = {
            'query_hmm':[0,seqlen],
            'query_orf':[queryorf.protein_startPY,queryorf.protein_endPY],
            }

        # perform clustalw multiple alignment
        (alignedseqs,alignment) = clustalw( seqs= seqdict )
        # strip exterior gaps
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )

        if alignedseqs['query_hmm'].count("-") > 0:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjctseq_as_list = list(sbjctseq)
            for pos in range(0,len(alignedseqs['query_hmm'])):
                if alignedseqs['query_hmm'][pos] == "-":
                    sbjctseq_as_list.insert(pos,"-")
                if alignedseqs['query_hmm'].find("-",pos) == -1:
                    break
            sbjctseq = "".join(sbjctseq_as_list)

        ########################################################################
        if verbose:
            print "\t", "FALSE::", sbjctseq, "[ WITH GAPS,SBJCT ]" 
            print "\t", "FALSE::", queryseq, "[ WITH GAPS ]" 
            for k,algseq in alignedseqs.iteritems():
                print "\t", "FALSE::", algseq, k, coords[k], len(algseq)
            print "\t", "FALSE::", sbjctseq, "SBJCT", len(sbjctseq)
            print "\t", "FALSE::", alignment, "ALMNT", len(alignment)
            print "\t", "SOLVED:", len(alignedseqs['query_orf']) == len(sbjctseq)
        ########################################################################
    
        # update query sequence & coordinates
        if len(alignedseqs['query_orf']) == len(sbjctseq):
            queryseq       = alignedseqs['query_orf']
            query_aa_start = coords['query_orf'][0]
            query_aa_end   = coords['query_orf'][1]
        else:
            # still not identical lengths. ClustalW recovery of HMM hit
            # failed miserably. For now: omit
            # TODO: resolve this case!!
            # example: --filewithloci examples/bilal/CFU_830450.bothss.csv
            # ## HMM clustalw input profile: False MAXSR True
            # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598]
            # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388]
            # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1)
            # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD'
            # hmmhit2pacbp m '+ ka +   F  W   k  + nLG Wl  E   d'
            # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID'
            # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34
            # hmmM: + ka +   F  W   k  + nLG Wl  E   d
            # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34
            # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ]
            #         FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ]
            #         FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70
            #         FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71
            #         FALSE:: **:***       *.: ::*::                 * .*           :.:*:  * *: : :: ALMNT 70
            #         SOLVED: False
            # Pacbp creation failed!
            return False, None

    if queryseq and sbjctseq:
        ################################################################
        if len(queryseq) != len(sbjctseq):
            # this will result in a exception to be raised:
            # pacb.exceptions.InproperlyAppliedArgument
            # print data here about what went wrong, then
            # just let the error be raised
            print queryseq, len(queryseq), sbjctseq, len(sbjctseq)
            print hmmhit
            print "Q:", query_aa_start, query_aa_end,
            print query_aa_end - query_aa_start, "len:", len(queryseq)
            print "S:", sbjctaastart, sbjctaaend,
            print sbjctaaend - sbjctaastart, "len:",len(sbjctseq)
        ################################################################
        pacbpinput = (queryseq,sbjctseq,query_aa_start,sbjctaastart)
        pacbp      = PacbP(input=pacbpinput)
        # remove consistent internal gaps caused hy HMM profile search
        pacbp.strip_consistent_internal_gaps()
        pacbp.source = 'hmmsearch'
        pacbporf   = PacbPORF(pacbp,queryorf,sbjctorf)
        pacbporf.strip_unmatched_ends()
        if pacbporf.length==0:
            # Pacbp creation failed!
            return False, None
        else:
            pacbporf.extend_pacbporf_after_stops()
            pacbpkey = pacbporf.construct_unique_key(queryNode,sbjctNode)
            # return unique key and pacbporf
            return (pacbpkey,queryNode,sbjctNode), pacbporf
    else:
        # Pacbp creation failed!
        return False, None
Example #15
0
def merge_pacbporfs_with_closeby_independant_introns(pacbporfD,
                                                     pacbporfA,
                                                     verbose=False,
                                                     **kwargs):
    """
    Merge 2 PacbPORF objects by closeby independant gained introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs)

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  verbose: Boolean
    @param verbose: print status/debugging messages to STDOUT

    @rtype:  list
    @return: list with ( intronQ, intronS, CIGexonPacbPORF )
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    kwargs['allow_phase_shift'] = True
    _update_kwargs(kwargs, KWARGS_CLOSEBY_INDEPENDANT_INTRON_GAIN)
    if not kwargs.has_key('aligned_site_max_triplet_distance'):
        kwargs['aligned_site_max_triplet_distance'] = kwargs[
            'cig_max_aa_length']

    # run regular merge_pacbporfs_with_introns function
    alg_introns = merge_pacbporfs_with_introns(pacbporfD,
                                               pacbporfA,
                                               verbose=verbose,
                                               **kwargs)
    cig_introns = []

    if verbose:
        print "introns::", len(alg_introns), "cig_max_aa_length:", kwargs[
            'cig_max_aa_length'], kwargs['aligned_site_max_triplet_distance']

    # check if there is length congruence between the cig_introns
    for intQ, intS in alg_introns:
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)
        ########################################################################
        if verbose:
            print(intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos),
            print distDnt, distAnt, kwargs['max_nt_offset']
        ########################################################################
        if abs(distDnt - distAnt) > kwargs['max_nt_offset']:
            # intermediate ciigPacbPORF has query vs sbjct length discrepancy
            # *3 for AA2nt coordinate conversion, +2 to allow different phases
            # e.g. phase difference can give 1AA+2nt difference
            continue
        if intQ.donor.phase == intS.donor.phase and\
        (distDnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if intQ.acceptor.phase == intS.acceptor.phase and\
        (distAnt/3) <= kwargs['aligned_site_max_triplet_distance']:
            # a regularly merged intron combination
            continue
        if abs(distDnt) <= 5 or abs(distDnt) <= 5:
            # most likely a splice site phase shift, not a c.i.g.
            continue

        if abs(distDnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distAnt/3) >= kwargs['cig_min_aa_length'] and\
        abs(distDnt/3) <= kwargs['cig_max_aa_length'] and\
        abs(distAnt/3) <= kwargs['cig_max_aa_length']:
            # putatively a closeby independant (intron) gain
            cig_introns.append((intQ, intS))

    ############################################################################
    if verbose:
        for intQ, intS in cig_introns:
            print "cig?:", (intQ.donor.pos, intQ.acceptor.pos),
            print(intS.donor.pos, intS.acceptor.pos)
    ############################################################################

    # return variable to store found positive cases of CIG into
    found_cig_list = []

    # check if there is some sequence similarity
    for intQ, intS in cig_introns:
        # get alignment positions around query & sbjcts splice sites
        dQpos, dQphase = pacbporfD.dnaposition_query(intQ.donor.pos,
                                                     forced_return=True)
        dSpos, dSphase = pacbporfD.dnaposition_sbjct(intS.donor.pos,
                                                     forced_return=True)
        aQpos, aQphase = pacbporfA.dnaposition_query(intQ.acceptor.pos,
                                                     forced_return=True)
        aSpos, aSphase = pacbporfA.dnaposition_sbjct(intS.acceptor.pos,
                                                     forced_return=True)
        distD = dQpos - dSpos
        distA = aQpos - aSpos
        distDnt = (dQpos * 3 + dQphase) - (dSpos * 3 + dSphase)
        distAnt = (aQpos * 3 + aQphase) - (aSpos * 3 + aSphase)

        if distDnt > 0:  # then, distAnt is as well > 0
            # QUERY is extended on the donor side
            #mode   = "SQ"
            #qStart = pacbporfD._positions[dSpos].query_pos
            #qEnd   = qStart + distD
            #sStart = pacbporfA._positions[aSpos].sbjct_pos
            #sEnd   = sStart + distD
            #qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,abs_pos_end=qEnd)
            #sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,abs_pos_end=sEnd)
            mode = "SQ"
            qEnd = pacbporfD.orfQ.dnapos2aapos(intQ.donor.pos)
            qStart = qEnd - max([distA, distD])
            sStart = pacbporfA.orfS.dnapos2aapos(intS.acceptor.pos)
            sEnd = sStart + max([distA, distD])
            qSeq = pacbporfD.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfA.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        else:  # distDnt and distAnt are < 0
            ## SBJCT is extended on the donor site
            #mode   = "QS"
            #qStart = pacbporfA._positions[aQpos].query_pos
            #qEnd   = qStart - distA
            #sStart = pacbporfD._positions[dQpos].sbjct_pos
            #sEnd   = sStart - distA
            #qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart, abs_pos_end=qEnd)
            #sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart, abs_pos_end=sEnd)
            mode = "QS"
            qStart = pacbporfA.orfQ.dnapos2aapos(intQ.acceptor.pos)
            qEnd = qStart - min([distA, distD])
            sEnd = pacbporfD.orfS.dnapos2aapos(intS.donor.pos)
            sStart = sEnd + min([distA, distD])
            qSeq = pacbporfA.orfQ.getaas(abs_pos_start=qStart,
                                         abs_pos_end=qEnd)
            sSeq = pacbporfD.orfS.getaas(abs_pos_start=sStart,
                                         abs_pos_end=sEnd)

        headerQ = "query_%s_%s_%s" % (qStart, qEnd, qSeq)
        headerS = "sbjct_%s_%s_%s" % (sStart, sEnd, sSeq)
        headerQ = headerQ[0:20]  # truncate to prevent error
        headerS = headerS[0:20]  # truncate to prevent error
        if verbose:
            print mode, (
                distD, distA), qSeq, sSeq, headerQ, headerS, distDnt, distAnt,
            print dQpos, aQpos, dSpos, aSpos
        if not qSeq: continue  # superfluous check-doublecheck for sequence
        if not sSeq: continue  # superfluous check-doublecheck for sequence

        ####################################################
        # make PacbPORF with ClustalW
        ####################################################
        # align the sequences with clustalw
        seqs = {headerQ: qSeq, headerS: sSeq}
        (alignedseqs, alignment) = clustalw(seqs=seqs)

        # make pacbp from clustalw alignment
        pacbp = pacbp_from_clustalw(alignment=(alignedseqs[headerQ], alignment,
                                               alignedseqs[headerS]),
                                    coords=(qStart, qEnd, sStart, sEnd))

        if not pacbp: continue

        # strip unaligned fraction of this pacbp object, then check length
        pacbp.strip_unmatched_ends()

        if len(pacbp) < kwargs['cig_min_aa_length']:
            continue
        if len(pacbp) > kwargs['cig_max_aa_length']:
            continue

        if pacbp:
            # initialize extended tiny PacbPORF caused by c.i.g.
            if distDnt > 0:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfD.orfQ,
                                              pacbporfA.orfS)
            else:
                cig_pacbporf = pacbp2pacbporf(pacbp, pacbporfA.orfQ,
                                              pacbporfD.orfS)
            cig_pacbporf.extend_pacbporf_after_stops()
            ####################################################################
            if verbose:
                print pacbp, len(pacbp)
                print cig_pacbporf
                print "CIG:", intQ
                print "CIG:", intS
                print distD, distA, distDnt, distAnt
                cig_pacbporf.print_protein_and_dna()
            ####################################################################

            ####################################################################
            # set some meta-data properties to the intron objects
            ####################################################################

            # add distance score to introns
            # The distance set in merge_pacbporfs_with_introns is large;
            # it is the actual distance between the splice sites. In CIG,
            # the measure for distance is the length difference between
            # the offset between query and sbjct measured on the cig_pacbporf
            intQ._distance = abs(distDnt - distAnt)
            intS._distance = abs(distDnt - distAnt)

            if distDnt > 0:  # then, distAnt is as well > 0
                # QUERY is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, cig_pacbporf, pacbporfA)
                succes = set_apps_intron_sbjct(intS, pacbporfD, cig_pacbporf)
            else:
                # SBJCT is extended on the donor side
                # add Alignment Positional Periphery Score into objects
                succes = set_apps_intron_query(intQ, pacbporfD, cig_pacbporf)
                succes = set_apps_intron_sbjct(intS, cig_pacbporf, pacbporfA)

            # set GFF fsource attribute for recognition of intron sources
            intQ._gff['fsource'] = "ABGPcig"
            intS._gff['fsource'] = "ABGPcig"

            # create _linked_to_xxx attributes
            intQ._linked_to_pacbporfs = [cig_pacbporf]
            intS._linked_to_pacbporfs = [cig_pacbporf]

            # append to found_cig_list
            found_cig_list.append((intQ, intS, cig_pacbporf))

        else:
            # no alignment possible -> try next
            continue

    # return lists of closeby_independant_introns
    return found_cig_list