def multiplealignment(self):
    """Return the trimmed ClustalW alignment of the MAXSR sequences as FASTA.

    The sequences and coordinates delivered by
    ``get_maxsr_proteinsequences_and_coords()`` are re-keyed from Node to
    Organism identifier, aligned with ClustalW and stripped of leading and
    trailing gaps.

    Returns:
        A single string of multi-line FASTA; every record has the header
        ``>ORGANISM_orf_X`` where X is element [1] of the organism's node
        (presumably the orf id -- confirm against node_by_organism()).
    """
    node_seqs, node_coords = self.get_maxsr_proteinsequences_and_coords()

    # Node keys -> Organism keys; coordinates collapse to [min, max + 1]
    org_coords = {}
    for node, positions in node_coords.iteritems():
        organism = self.organism_by_node(node)
        org_coords[organism] = [min(positions), max(positions) + 1]

    org_seqs = {}
    for node, protseq in node_seqs.iteritems():
        organism = self.organism_by_node(node)
        org_seqs[organism] = protseq

    # ClustalW alignment, then removal of the exterior gaps
    aligned, matchline = clustalw(seqs=org_seqs)
    aligned, matchline, org_coords = strip_alignment_for_exterior_gaps(
        aligned, matchline, org_coords)

    # assemble the multi-line fasta string, one record per organism
    records = []
    for organism, alignedseq in aligned.iteritems():
        orf_tag = self.node_by_organism(organism)[1]
        records.append(">%s_orf_%s\n%s" % (organism, orf_tag, alignedseq))
    return "\n".join(records)
Example #2
0
    def multiplealignment(self):
        """Align the MAXSR protein sequences and return them as FASTA text.

        Sequences and coordinates come from
        ``get_maxsr_proteinsequences_and_coords()`` keyed by Node; both are
        re-keyed by Organism identifier before ClustalW is run. Exterior
        (leading/trailing) gaps are stripped from the resulting alignment.

        Returns:
            One string holding a multi-line FASTA record per organism, with
            header ``>ORGANISM_orf_X`` (X is element [1] of the organism's
            node -- presumably the orf id; confirm with node_by_organism()).
        """
        raw_seqs, raw_coords = self.get_maxsr_proteinsequences_and_coords()

        # translate Node keys to Organism keys; keep only [min, max + 1]
        span_by_org = dict(
            (self.organism_by_node(node), [min(span), max(span) + 1])
            for node, span in raw_coords.iteritems())
        seq_by_org = dict(
            (self.organism_by_node(node), protein)
            for node, protein in raw_seqs.iteritems())

        # run ClustalW and trim the exterior gaps of the alignment
        aligned_by_org, consensus = clustalw(seqs=seq_by_org)
        stripped = strip_alignment_for_exterior_gaps(
            aligned_by_org, consensus, span_by_org)
        aligned_by_org, consensus, span_by_org = stripped

        # build the fasta text
        fasta_entries = []
        for org, gapped in aligned_by_org.iteritems():
            header = ">%s_orf_%s" % (org, self.node_by_organism(org)[1])
            fasta_entries.append("%s\n%s" % (header, gapped))
        return "\n".join(fasta_entries)
Example #3
0
def _create_hmm_profile(cbg,
                        area="OMSR",
                        prevcbg=None,
                        nextcbg=None,
                        strip_nonaligned_residues=False,
                        verbose=False,
                        **kwargs):
    """
    """
    # area must be one of
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del (coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in [
            "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
            "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"
    ]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in [
            "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF"
    ]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k, seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del (coords[k])
        del (fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.iteritems()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None
        for node, algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Example #4
0
def hmmhit2pacbp(queryorf,
                 queryorg,
                 querycoords,
                 sbjctorf,
                 sbjctorg,
                 hmmhit,
                 verbose=False):
    """Convert a single HMM search hit into a PacbPORF of query and sbjct Orf.

    The hit is trimmed for unmatched ends and exterior gaps, its coordinates
    are translated to absolute AA positions on both Orfs and, when the query
    string of the hit does not equal the query Orf sequence on the computed
    position, the hit is re-mapped onto the query Orf with ClustalW.

    Args:
        queryorf: Orf of the query; its ``id``, ``getaas()``,
            ``protein_sequence``, ``protein_startPY`` and ``protein_endPY``
            are used here.
        queryorg: organism identifier of the query; forms the query Node
            together with ``queryorf.id``.
        querycoords: indexable whose element [0] is added to the hit's query
            start to get the AA start on queryorf (presumably the
            [start, end] entry returned by _create_hmm_profile -- confirm).
        sbjctorf: Orf of the sbjct; its ``id`` and ``protein_startPY`` are
            used here.
        sbjctorg: organism identifier of the sbjct.
        hmmhit: 10-tuple ``(sbjct_header, sbjct_start, sbjct_end,
            query_start, query_end, query, match, sbjct, score, expect)``.
        verbose: print debugging information to stdout.

    Returns:
        ``((pacbpkey, queryNode, sbjctNode), pacbporf)`` on success,
        ``(False, None)`` when no PacbPORF could be created.
    """
    # trim hmmhit for unmatched characters
    (sbjct_header, sbjct_start, sbjct_end, query_start, query_end, query,
     match, sbjct, score, expect) = hmmhit

    # drop leading alignment columns without a match symbol; both start
    # coordinates shift along
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        sbjct_start += 1
        query_start += 1
    # same for trailing columns without a match symbol
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        sbjct_end -= 1
        query_end -= 1

    # get orf, node and AA and DNA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end = sbjct_end + sbjctorf.protein_startPY
    sbjctNode = (sbjctorg, sbjctorf.id)
    # normalize gap symbol ('.' -> '-') and case of both sequences
    query = query.replace(".", "-").upper()
    sbjct = sbjct.replace(".", "-").upper()

    ############################################################################
    if verbose:
        print "hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
            sbjctorg, sbjctorf.id)
        print "hmmhit2pacbp Q '%s'" % query
        print "hmmhit2pacbp m '%s'" % match
        print "hmmhit2pacbp S '%s'" % sbjct
        print "hmmQ:", query, query_start, query_end, "gaps:",
        print query.count('-'), len(query)
        print "hmmM:", match
        print "hmmS:", sbjct, sbjctNode, sbjct_aa_start, sbjct_aa_end,
        print "len:", sbjct_aa_end - sbjct_aa_start, len(sbjct)
    ############################################################################

    # get Node and sequence of the query
    queryNode = (queryorg, queryorf.id)
    queryseq = deepcopy(query)

    # calculate query sequence position on queryorf
    # (gaps in the query string do not occupy positions on the Orf)
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end = query_aa_start + len(queryseq) - queryseq.count('-')

    ############################################################################
    if verbose:
        print "hmmq:", queryseq, queryNode, query_aa_start, query_aa_end,
        print "len:", query_aa_end - query_aa_start, len(queryseq)
    ############################################################################

    # make a deepcopy; sbjct is needed unchanged for the next iteration
    # in the for loop, but here we want to trim of gap sequences
    # NOTE(review): there is no enclosing for loop in this function; the
    # remark above looks like a leftover from an earlier version -- confirm.
    sbjctseq = deepcopy(sbjct)
    sbjctaastart = deepcopy(sbjct_aa_start)
    sbjctaaend = deepcopy(sbjct_aa_end)
    # leading gap in the query: drop the column, the sbjct start moves on
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart += 1
    # leading gap in the sbjct: drop the column, the query start moves on
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start += 1
    # trailing gap in the query: drop the column, the sbjct end moves back
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend -= 1
    # trailing gap in the sbjct: drop the column, the query end moves back
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end -= 1

    # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM
    # profiles are build from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). Problem with HMM is, that in the result file
    # no information is written on where in the constructed HMM this hit
    # starts. This **sucks** because special care was taken in ABFGP code to
    # make sure the exact aa-coordinates of the applied sequences to ClustalW
    # are known. Hmmbuild here nullifies this effort by not giving start
    # coordinates. Therefore, we have to check the exact start position
    # of the HMM match on the queryorf.
    if queryseq.replace("-", "") != queryorf.getaas(query_aa_start,
                                                    query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-", "X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-", ""))

        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
        }

        # make coords dictionary for remapping
        coords = {
            'query_hmm': [0, seqlen],
            'query_orf': [queryorf.protein_startPY, queryorf.protein_endPY],
        }

        # perform clustalw multiple alignment
        (alignedseqs, alignment) = clustalw(seqs=seqdict)
        # strip exterior gaps
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

        if alignedseqs['query_hmm'].count("-") > 0:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjctseq_as_list = list(sbjctseq)
            for pos in range(0, len(alignedseqs['query_hmm'])):
                if alignedseqs['query_hmm'][pos] == "-":
                    sbjctseq_as_list.insert(pos, "-")
                # stop as soon as no gaps remain at or behind this position
                if alignedseqs['query_hmm'].find("-", pos) == -1:
                    break
            sbjctseq = "".join(sbjctseq_as_list)

        ########################################################################
        if verbose:
            print "\t", "FALSE::", sbjctseq, "[ WITH GAPS,SBJCT ]"
            print "\t", "FALSE::", queryseq, "[ WITH GAPS ]"
            for k, algseq in alignedseqs.iteritems():
                print "\t", "FALSE::", algseq, k, coords[k], len(algseq)
            print "\t", "FALSE::", sbjctseq, "SBJCT", len(sbjctseq)
            print "\t", "FALSE::", alignment, "ALMNT", len(alignment)
            print "\t", "SOLVED:", len(
                alignedseqs['query_orf']) == len(sbjctseq)
        ########################################################################

        # update query sequence & coordinates
        if len(alignedseqs['query_orf']) == len(sbjctseq):
            queryseq = alignedseqs['query_orf']
            query_aa_start = coords['query_orf'][0]
            query_aa_end = coords['query_orf'][1]
        else:
            # still not identical lengths. ClustalW recovery of HMM hit
            # failed miserably. For now: omit
            # TODO: resolve this case!!
            # example: --filewithloci examples/bilal/CFU_830450.bothss.csv
            # ## HMM clustalw input profile: False MAXSR True
            # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598]
            # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388]
            # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1)
            # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD'
            # hmmhit2pacbp m '+ ka +   F  W   k  + nLG Wl  E   d'
            # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID'
            # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34
            # hmmM: + ka +   F  W   k  + nLG Wl  E   d
            # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34
            # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ]
            #         FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ]
            #         FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70
            #         FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71
            #         FALSE:: **:***       *.: ::*::                 * .*           :.:*:  * *: : :: ALMNT 70
            #         SOLVED: False
            # Pacbp creation failed!
            return False, None

    if queryseq and sbjctseq:
        ################################################################
        if len(queryseq) != len(sbjctseq):
            # this will result in a exception to be raised:
            # pacb.exceptions.InproperlyAppliedArgument
            # print data here about what went wrong, then
            # just let the error be raised
            print queryseq, len(queryseq), sbjctseq, len(sbjctseq)
            print hmmhit
            print "Q:", query_aa_start, query_aa_end,
            print query_aa_end - query_aa_start, "len:", len(queryseq)
            print "S:", sbjctaastart, sbjctaaend,
            print sbjctaaend - sbjctaastart, "len:", len(sbjctseq)
        ################################################################
        pacbpinput = (queryseq, sbjctseq, query_aa_start, sbjctaastart)
        pacbp = PacbP(input=pacbpinput)
        # remove consistent internal gaps caused by HMM profile search
        pacbp.strip_consistent_internal_gaps()
        pacbp.source = 'hmmsearch'
        pacbporf = PacbPORF(pacbp, queryorf, sbjctorf)
        pacbporf.strip_unmatched_ends()
        if pacbporf.length == 0:
            # Pacbp creation failed!
            return False, None
        else:
            pacbporf.extend_pacbporf_after_stops()
            pacbpkey = pacbporf.construct_unique_key(queryNode, sbjctNode)
            # return unique key and pacbporf
            return (pacbpkey, queryNode, sbjctNode), pacbporf
    else:
        # Pacbp creation failed!
        return False, None
Example #5
0
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None,
    strip_nonaligned_residues=False,
    verbose=False,**kwargs):
    """
    """
    # area must be one of 
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) )
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) )
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node])+1,theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del(coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( prevcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] )
            end = max(coords[nodeCbg])+1
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( nextcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] )
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF"]:
        maxlength = max([ len(vlist) for vlist in coords.values() ])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and 
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k,seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del(coords[k])
        del(fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ])

    # perform clustalw multiple alignment
    (alignedseqs,alignment) = clustalw( seqs= fastaseqs )


    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR","MINSR"]:
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )


    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs,alignment,coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 )


    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs,alignment,coords = strip_overall_nonaligned_residues(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}


    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None
        for node,algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs,fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile )

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Example #6
0
def _hmmhit_symbol_is_residue(symbol):
    """Return True when a single alignment character is a residue.

    Both '.' and '-' count as gap characters (hmmhit2pacbp only normalizes
    '.' to '-' after the unmatched columns have been trimmed). An empty
    string, i.e. a sequence line shorter than the match line, is no residue.
    """
    return bool(symbol) and symbol not in ".-"


def _hmmhit_debug_line(*items):
    """Join items with single spaces, as a Python 2 print statement does."""
    return " ".join([str(item) for item in items])


def hmmhit2pacbp(queryorf,queryorg,querycoords,sbjctorf,sbjctorg,hmmhit,verbose=False):
    """
    Convert a single HMM search hit into a PacbPORF object.

    @type  queryorf: Orf-like object
    @param queryorf: must offer .id and .getaas(start,end); when the query
                     has to be re-anchored, .protein_sequence,
                     .protein_startPY and .protein_endPY are read as well

    @type  queryorg: *
    @param queryorg: organism identifier; first element of the query Node

    @type  querycoords: sequence
    @param querycoords: only querycoords[0] is used, as offset that is added
                        to the (1-based) query start reported in the hit

    @type  sbjctorf: Orf-like object
    @param sbjctorf: must offer .id and .protein_startPY

    @type  sbjctorg: *
    @param sbjctorg: organism identifier; first element of the sbjct Node

    @type  hmmhit: tuple
    @param hmmhit: 10 elements: ( sbjct_header, sbjct_start, sbjct_end,
                   query_start, query_end, query, match, sbjct, score, expect )

    @type  verbose: Boolean
    @param verbose: print debugging information to STDOUT when True

    @rtype:  tuple
    @return: ( (pacbpkey,queryNode,sbjctNode), pacbporf ) on success,
             ( False, None ) when no PacbPORF could be created

    @attention: when query and sbjct still differ in length after all
                corrections, diagnostics are printed and the exception
                raised by PacbP() is NOT caught here
    """
    ( sbjct_header, sbjct_start, sbjct_end,
      query_start, query_end,
      query, match, sbjct, score, expect ) = hmmhit

    # Trim hmmhit for columns without a match symbol at both ends.
    # Coordinates count residues only, so a coordinate may only be moved
    # when the trimmed column holds a residue (not a gap) in that sequence.
    while match and match[0] == ' ':
        if _hmmhit_symbol_is_residue(query[:1]):
            query_start += 1
        if _hmmhit_symbol_is_residue(sbjct[:1]):
            sbjct_start += 1
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
    while match and match[-1] == ' ':
        if _hmmhit_symbol_is_residue(query[-1:]):
            query_end -= 1
        if _hmmhit_symbol_is_residue(sbjct[-1:]):
            sbjct_end -= 1
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]

    # get orf, node and AA coordinates of this sbjct hit;
    # correct for -1 offset in start coordinate!!
    sbjct_aa_start  = sbjct_start - 1 + sbjctorf.protein_startPY
    sbjct_aa_end    = sbjct_end + sbjctorf.protein_startPY
    sbjctNode       = (sbjctorg,sbjctorf.id)
    query           = query.replace(".","-").upper()
    sbjct           = sbjct.replace(".","-").upper()

    ############################################################################
    if verbose:
        print("hmmhit2pacbp CREATING pacbps for organism/orf: (%s,%s)" % (
                sbjctorg,sbjctorf.id))
        print("hmmhit2pacbp Q '%s'" % query)
        print("hmmhit2pacbp m '%s'" % match)
        print("hmmhit2pacbp S '%s'" % sbjct)
        print(_hmmhit_debug_line("hmmQ:", query, query_start, query_end,
                "gaps:", query.count('-'), len(query)))
        print(_hmmhit_debug_line("hmmM:", match))
        print(_hmmhit_debug_line("hmmS:", sbjct, sbjctNode, sbjct_aa_start,
                sbjct_aa_end, "len:", sbjct_aa_end-sbjct_aa_start, len(sbjct)))
    ############################################################################

    # get Node and sequence of the query
    queryNode = (queryorg,queryorf.id)
    queryseq  = query

    # calculate query sequence position on queryorf
    query_aa_start = querycoords[0] + query_start - 1
    query_aa_end   = query_aa_start + len(queryseq) - queryseq.count('-')

    ############################################################################
    if verbose:
        print(_hmmhit_debug_line("hmmq:", queryseq, queryNode, query_aa_start,
                query_aa_end, "len:", query_aa_end-query_aa_start,
                len(queryseq)))
    ############################################################################

    # Working copies for trimming exterior gap columns; strings and integers
    # are immutable, so plain assignment is enough to keep the originals.
    sbjctseq = sbjct
    sbjctaastart = sbjct_aa_start
    sbjctaaend   = sbjct_aa_end
    # a leading/trailing gap in one sequence consumes a residue of the
    # other sequence, hence the coordinate of the OTHER sequence is moved
    while queryseq and queryseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        sbjctaastart+=1
    while sbjctseq and sbjctseq[0] == '-':
        queryseq = queryseq[1:]
        sbjctseq = sbjctseq[1:]
        query_aa_start+=1
    while queryseq and queryseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        sbjctaaend-=1
    while sbjctseq and sbjctseq[-1] == '-':
        queryseq = queryseq[0:-1]
        sbjctseq = sbjctseq[0:-1]
        query_aa_end-=1

    # NEW NEW code in december 2010. Since inwpCBGs are implemented, HMM
    # profiles are build from clustalw alignments which have loosely aligned
    # tails (SPRDIF sequences). Problem with HMM is, that in the result file
    # no information is written on where in the constructed HMM this hit
    # starts. This **sucks** because special care was taken in ABFGP code to
    # make sure the exact aa-coordinates of the applied sequences to ClustalW
    # are known. Hmmbuild here nullifies this effort by not giving start
    # coordinates. Therefore, we have to check the exact start position
    # of the HMM match on the queryorf.
    if queryseq.replace("-","") != queryorf.getaas(query_aa_start,query_aa_end):
        # obtain (search) query sequence, replace gaps by X symbol
        searchqueryseq = queryseq.upper().replace("-","X")
        # count length of the query sequence; here IGNORE THE GAPS!!
        seqlen = len(queryseq.upper().replace("-",""))

        # make fasta sequence dictionary
        seqdict = {
            'query_hmm': searchqueryseq,
            'query_orf': queryorf.protein_sequence,
            }

        # make coords dictionary for remapping
        coords = {
            'query_hmm':[0,seqlen],
            'query_orf':[queryorf.protein_startPY,queryorf.protein_endPY],
            }

        # perform clustalw multiple alignment
        (alignedseqs,alignment) = clustalw( seqs= seqdict )
        # strip exterior gaps
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )

        if alignedseqs['query_hmm'].count("-") > 0:
            # in (very) exceptional cases, gaps can be introduced in the
            # clustalw alignment in the HMM seq. This normally does not
            # occur! Fix this here by placing gaps in sbjctseq too.
            sbjctseq_as_list = list(sbjctseq)
            for pos in range(0,len(alignedseqs['query_hmm'])):
                if alignedseqs['query_hmm'][pos] == "-":
                    sbjctseq_as_list.insert(pos,"-")
                # stop as soon as no gaps remain from this position onwards
                if alignedseqs['query_hmm'].find("-",pos) == -1:
                    break
            sbjctseq = "".join(sbjctseq_as_list)

        ########################################################################
        if verbose:
            print("\t" + _hmmhit_debug_line("FALSE::", sbjctseq,
                    "[ WITH GAPS,SBJCT ]"))
            print("\t" + _hmmhit_debug_line("FALSE::", queryseq,
                    "[ WITH GAPS ]"))
            for k in alignedseqs:
                algseq = alignedseqs[k]
                print("\t" + _hmmhit_debug_line("FALSE::", algseq, k,
                        coords[k], len(algseq)))
            print("\t" + _hmmhit_debug_line("FALSE::", sbjctseq, "SBJCT",
                    len(sbjctseq)))
            print("\t" + _hmmhit_debug_line("FALSE::", alignment, "ALMNT",
                    len(alignment)))
            print("\t" + _hmmhit_debug_line("SOLVED:",
                    len(alignedseqs['query_orf']) == len(sbjctseq)))
        ########################################################################

        # update query sequence & coordinates
        if len(alignedseqs['query_orf']) == len(sbjctseq):
            queryseq       = alignedseqs['query_orf']
            query_aa_start = coords['query_orf'][0]
            query_aa_end   = coords['query_orf'][1]
        else:
            # still not identical lengths. ClustalW recovery of HMM hit
            # failed miserably. For now: omit
            # TODO: resolve this case!!
            # example: --filewithloci examples/bilal/CFU_830450.bothss.csv
            # ## HMM clustalw input profile: False MAXSR True
            # FPKGCESGKFINWKTFKANGVNLGAWLAKEKTHDPVW foxga [561, 598]
            # FQRACR--KFID-ETLSAHAL---EWESKEIVPPEVW CFU [357, 388]
            # hmmhit2pacbp CREATING pacbps for organism/orf: (NP1064101[anid],1)
            # hmmhit2pacbp Q 'FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD'
            # hmmhit2pacbp m '+ ka +   F  W   k  + nLG Wl  E   d'
            # hmmhit2pacbp S 'YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID'
            # hmmQ: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD 1 34 gaps: 0 34
            # hmmM: + ka +   F  W   k  + nLG Wl  E   d
            # hmmS: YTKAFQ--PF-SWSSAKVRGANLGGWLVQEASID ('NP1064101[anid]', 1) 33 64 len: 31 34
            # hmmq: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD ('CFU', 91) 357 391 len: 34 34
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID [ WITH GAPS,SBJCT ]
            #         FALSE:: FQKACRSGKFIDWKTLKANALNLGEWLAKEKVHD [ WITH GAPS ]
            #         FALSE:: FQKACR-------SGKFIDWKT-----------------LKAN----------ALNLGE--W-LAKEKVH query_hmm [0, 33] 70
            #         FALSE:: FQRACRKFIDETLSAHALEWESKEIVPPEVWQRFAEANMLIPNLAALASRMVGEIGIGNAFWRLSVQGLR query_orf [357, 427] 70
            #         FALSE:: YTKAFQ---------PF-SWSS-----------------AKVR----------GANLGG--W-LVQEASID SBJCT 71
            #         FALSE:: **:***       *.: ::*::                 * .*           :.:*:  * *: : :: ALMNT 70
            #         SOLVED: False
            # Pacbp creation failed!
            return False, None

    if queryseq and sbjctseq:
        ################################################################
        if len(queryseq) != len(sbjctseq):
            # this will result in a exception to be raised:
            # pacb.exceptions.InproperlyAppliedArgument
            # print data here about what went wrong, then
            # just let the error be raised
            print(_hmmhit_debug_line(queryseq, len(queryseq),
                    sbjctseq, len(sbjctseq)))
            print(hmmhit)
            print(_hmmhit_debug_line("Q:", query_aa_start, query_aa_end,
                    query_aa_end - query_aa_start, "len:", len(queryseq)))
            print(_hmmhit_debug_line("S:", sbjctaastart, sbjctaaend,
                    sbjctaaend - sbjctaastart, "len:", len(sbjctseq)))
        ################################################################
        pacbpinput = (queryseq,sbjctseq,query_aa_start,sbjctaastart)
        pacbp      = PacbP(input=pacbpinput)
        # remove consistent internal gaps caused by HMM profile search
        pacbp.strip_consistent_internal_gaps()
        pacbp.source = 'hmmsearch'
        pacbporf   = PacbPORF(pacbp,queryorf,sbjctorf)
        pacbporf.strip_unmatched_ends()
        if pacbporf.length==0:
            # Pacbp creation failed!
            return False, None
        else:
            pacbporf.extend_pacbporf_after_stops()
            pacbpkey = pacbporf.construct_unique_key(queryNode,sbjctNode)
            # return unique key and pacbporf
            return (pacbpkey,queryNode,sbjctNode), pacbporf
    else:
        # Pacbp creation failed!
        return False, None