Exemple #1
0
def get_cexpander_string_of_cbg(cbg,verbose=False):
    """
    Run cexpander and get the cexpander binary string for this CBG

    @attention: recommended not to use, use more detailed cexpanderanalyses()

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes
    """
    # (0) create (~unique) basefname
    basefname = "cbg_"+ "".join([ "%s%s" % (node[0],node[1]) for node in cbg.get_ordered_nodes() ])
    fname_fasta     = basefname+".fa"
    fname_allvsall  = basefname+".allvsall"
    fname_aligned   = basefname+".aligned"
    fname_cexpander = basefname+".cexpander"

    # (1) write multi fasta of OMSR sequences
    omsrseqs= cbg.getomsrproteinsequences()
    writeMultiFasta( omsrseqs , fname_fasta )
    strongestN  = cbg.strongest_connected_node()
    strongestO  = cbg.organism_by_node(strongestN)

    # create complete .fa -> cexpanderstring command
    command = """
        python %s %s %s;
        %s -i %s %s > %s;
        %s %s ">%s" > %s;
        cat %s | grep "\$start_values" -A 1000 | grep "\$end_values" -m 1 -B 1000 | sed 's/^.*_values$//' | sed '/^$/d' | tr -d "\\t" | tr -d "\\n"
        """ % (
        EXECUTABLE_CEXPANDER_ALLVSALL, fname_fasta, fname_allvsall,
        EXECUTABLE_CEXPANDER_CBALIGNP, fname_allvsall, "-y", fname_aligned,
        EXECUTABLE_CEXPANDER_CEXPANDER, fname_aligned, strongestO, fname_cexpander,
        fname_cexpander )

    ci,co,ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderstring = co.readlines()[-1]
    co.close()
    error = ce.read()
    ce.close()

    # (6) cleanup files
    osSystem("rm -f %s.*" % basefname )

    ############################################################
    if verbose:
        linesize=100
        print cbg
        for offset in range(0,len(cexpanderstring),linesize):
            print omsrseqs[strongestO][offset:offset+linesize]
            print cexpanderstring[offset:offset+linesize]
    ############################################################

    # (7) return the cexpander string
    return ( cexpanderstring, strongestO )
Exemple #2
0
def get_cexpander_string_of_cbg(cbg, verbose=False):
    """
    Run cexpander and get the cexpander binary string for this CBG

    @attention: recommended not to use, use more detailed cexpanderanalyses()

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes
    """
    # (0) create (~unique) basefname
    basefname = "cbg_" + "".join(
        ["%s%s" % (node[0], node[1]) for node in cbg.get_ordered_nodes()])
    fname_fasta = basefname + ".fa"
    fname_allvsall = basefname + ".allvsall"
    fname_aligned = basefname + ".aligned"
    fname_cexpander = basefname + ".cexpander"

    # (1) write multi fasta of OMSR sequences
    omsrseqs = cbg.getomsrproteinsequences()
    writeMultiFasta(omsrseqs, fname_fasta)
    strongestN = cbg.strongest_connected_node()
    strongestO = cbg.organism_by_node(strongestN)

    # create complete .fa -> cexpanderstring command
    command = """
        python %s %s %s;
        %s -i %s %s > %s;
        %s %s ">%s" > %s;
        cat %s | grep "\$start_values" -A 1000 | grep "\$end_values" -m 1 -B 1000 | sed 's/^.*_values$//' | sed '/^$/d' | tr -d "\\t" | tr -d "\\n"
        """ % (EXECUTABLE_CEXPANDER_ALLVSALL, fname_fasta, fname_allvsall,
               EXECUTABLE_CEXPANDER_CBALIGNP, fname_allvsall, "-y",
               fname_aligned, EXECUTABLE_CEXPANDER_CEXPANDER, fname_aligned,
               strongestO, fname_cexpander, fname_cexpander)

    ci, co, ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderstring = co.readlines()[-1]
    co.close()
    error = ce.read()
    ce.close()

    # (6) cleanup files
    osSystem("rm -f %s.*" % basefname)

    ############################################################
    if verbose:
        linesize = 100
        print cbg
        for offset in range(0, len(cexpanderstring), linesize):
            print omsrseqs[strongestO][offset:offset + linesize]
            print cexpanderstring[offset:offset + linesize]
    ############################################################

    # (7) return the cexpander string
    return (cexpanderstring, strongestO)
Exemple #3
0
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of its Orfs) compared to this CBG

    @attention: verbose defaults to True in this function, so debugging
                info is printed unless the caller switches it off

    @attention: NOTE(review): this copy of the function appears to stop
                after the multi fasta files are written; as written here it
                only ever returns None -- confirm against the full version

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get eligible lists of Orfs (one entry per organism, each with an
    # .orfs list)
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are eligible: when the total equals the number of
    # nodes in the CBG, the function gives up
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # containers for the remainder of the analysis; in the part of the
    # function visible here only multifastas gets filled
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        # one multi fasta file per organism; file name is relative, and
        # contains the barcode of the CBG
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [orf.id for orf in orfs[org].orfs],
            print [str(orf) for orf in orfs[org].orfs]
Exemple #4
0
def get_reverse_cbg(cbg,frame,verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength()/2)*3
    orfs = get_reverse_strand_orfsets(cbg,frame,min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org,cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(),fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs ]
        ########################################################################

    revpacbps = {}
    for orgQ,orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,orfQ.protein_sequence,
                        dbname="./"+blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(alignment.title.replace(">",""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength()/2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = ( orgQ, orfQ.protein_startPY )
                nodeS = ( orgS, orfS.protein_startPY )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [ pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values() ]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count()-1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
            edges=cbg.node_count()-1 , max_missing_edges=0 )
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
                cbgList.codingblockgraphs[0])
Exemple #5
0
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of if Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get elegiable lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are elgiable...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[orf.id for orf in orfs[org].orfs],
            print[str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight(
        ) * cbgList[0].omsr_identityscore()
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or (c, d) == ((0, 0, 1),
                                                          (1, 0, 0)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
Exemple #6
0
def _create_hmm_profile(cbg,
                        area="OMSR",
                        prevcbg=None,
                        nextcbg=None,
                        strip_nonaligned_residues=False,
                        verbose=False,
                        **kwargs):
    """
    """
    # area must be one of
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del (coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in [
            "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
            "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"
    ]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in [
            "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF"
    ]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k, seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del (coords[k])
        del (fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.iteritems()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None
        for node, algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Exemple #7
0
def cexpanderanalyses(cbg,min_cols=0,projected_on=":::",
    output='binary',cbgregion='omsr',verbose=False):
    """
    Run cexpander and get the CexpanderOutput object of this CBG

    For a CBG with more than 2 nodes, the protein sequences of the requested
    region are written to a multi fasta file and runcexpander() is called on
    it. For a CBG with 2 (or less) nodes no cexpander is run; a
    CexpanderOutput with an all-"1" binarystring is faked instead.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  min_cols: DEPRECATED integer
    @param min_cols: DEPRECATED default 0, only change when expert user!
                     (not used in this function)

    @type  projected_on: DEPRECATED string
    @param projected_on: DEPRECATED default ':::', only change when expert
                     user! It is only validated, not passed to runcexpander

    @type  output: string
    @param output: one of 'binary', 'float' (default 'binary')

    @type  cbgregion: string
    @param cbgregion: one of 'omsr', 'maxsr', 'omsr2orfend' (default 'omsr');
                     any other value is treated as 'omsr'

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes
                     (not used in this function)

    @rtype  cxpdrOutput: CexpanderOutput
    @return cxpdrOutput: CexpanderOutput object of this CBG

    @raise OrganismNotPresentInGraph: when projected_on is not ':::', not
                     empty and not an organism of this CBG (only checked for
                     a CBG with more than 2 nodes)
    """
    # create (~unique) basefname
    nodestringlist = []
    for node in cbg.get_ordered_nodes():
        nodestringlist.append( "%s%s" % (node[0],node[1]) )
    fname_fasta = "cbg_" + "".join(nodestringlist) + ".fa"

    if cbg.node_count() > 2:
        # validate projected_on BEFORE the fasta file is written; an
        # exception raised after writing would leave the file behind,
        # because only runcexpander() takes care of the file cleanup
        if projected_on != ":::":
            if not projected_on:
                strongestN   = cbg.strongest_connected_node()
                strongestO   = cbg.organism_by_node(strongestN)
                projected_on = strongestO
            elif projected_on not in cbg.organism_set():
                raise OrganismNotPresentInGraph

        # write multi fasta of the sequences of the requested region
        if cbgregion == 'maxsr':
            fastaseqs= cbg.getmaxsrproteinsequences()
        elif cbgregion == 'omsr2orfend':
            fastaseqs= cbg.getomsr2orfendproteinsequences()
            # append dummy W amino acid that mimics the
            # alignment of STOP codons
            for k,v in fastaseqs.iteritems():
                fastaseqs[k] = v+"W"
        else:
            # omsr or non-existing keyword...
            fastaseqs= cbg.getomsrproteinsequences()
        writeMultiFasta( fastaseqs , fname_fasta )

        # get cxpdrOutput object; file-cleanup is taken care for
        cxpdrOutput = runcexpander(fname_fasta,
                cbalignp_commandline = " -y", output=output)

        # correct hard-added 'W' residue in cexpander OMSR2ORFEND
        if cbgregion == 'omsr2orfend':
            for trfnum,trf in enumerate(cxpdrOutput._transferblocks):
                # one position less; and one match less when the dummy
                # residue was scored as a match
                trf.positions-=1
                if trf.binarystring[-1] == "1":
                    trf.score-=1
                trf.binarystring = trf.binarystring[0:-1]
                trf.ratio        = trf._binarystring2matchratio(trf.binarystring)
                if trfnum == 0:
                    # (re)set the main transferblock to the first one
                    cxpdrOutput.set_transferblock(trf.header)
            for k,seq in cxpdrOutput.sequences.iteritems():
                cxpdrOutput.sequences[k] = seq[0:-1]
        # EOF correct hard-added 'W' residue in cexpander OMSR2ORFEND

    else:
        # weird case of CBG with only 2 nodes;
        # can happen when ``weakest organism`` is removed
        # from the GSG based on GTG analyses
        # Fake cexpander output here
        pacbporf = cbg.pacbps.values()[0]
        bstring  = "1" *( pacbporf._original_alignment_pos_end -\
                          pacbporf._original_alignment_pos_start )
        cxpdrOutput = CexpanderOutput()
        cxpdrOutput.binarystring = bstring
        cxpdrOutput.header = cbg.organism_by_node(
                cbg.get_ordered_nodes()[0])
        cxpdrOutput.positions = len(bstring)
        cxpdrOutput.score = len(bstring)

    # (2) return the output object
    return cxpdrOutput
Exemple #8
0
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print[p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
Exemple #9
0
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print[len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print[pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomlementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
Exemple #10
0
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None,
    strip_nonaligned_residues=False,
    verbose=False,**kwargs):
    """
    """
    # area must be one of 
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) )
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) )
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node])+1,theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del(coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( prevcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] )
            end = max(coords[nodeCbg])+1
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( nextcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] )
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF"]:
        maxlength = max([ len(vlist) for vlist in coords.values() ])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and 
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k,seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del(coords[k])
        del(fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ])

    # perform clustalw multiple alignment
    (alignedseqs,alignment) = clustalw( seqs= fastaseqs )


    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR","MINSR"]:
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )


    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs,alignment,coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 )


    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs,alignment,coords = strip_overall_nonaligned_residues(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}


    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None
        for node,algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs,fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile )

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Exemple #11
0
def cexpanderanalyses(cbg,
                      min_cols=0,
                      projected_on=":::",
                      output='binary',
                      cbgregion='omsr',
                      verbose=False):
    """
    Run cexpander and get the CexpanderOutput object of this CBG

    For a CBG with only 2 nodes cexpander is not run; a CexpanderOutput
    with an all-"1" binarystring is faked instead.

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph

    @type  min_cols: DEPRECATED integer
    @param min_cols: DEPRECATED default 0, only change when expert user!
                     (not used in this function)

    @type  projected_on: DEPRECATED string
    @param projected_on: DEPRECATED default ':::', only change when expert
                     user! The value is validated against the organisms of
                     the CBG, but it is not forwarded to runcexpander()

    @type  output: string
    @param output: one of 'binary', 'float' (default 'binary')

    @type  cbgregion: string
    @param cbgregion: one of 'omsr', 'maxsr', 'omsr2orfend' (default 'omsr');
                     any other value is treated as 'omsr'

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes
                     (not used in this function)

    @rtype  cxpdrOutput: CexpanderOutput
    @return cxpdrOutput: CexpanderOutput object of this CBG

    @raise OrganismNotPresentInGraph: projected_on is neither ':::', empty
                     nor an organism of this CBG

    """
    # create (~unique) basefname
    nodestringlist = ["%s%s" % (node[0], node[1])
                      for node in cbg.get_ordered_nodes()]
    fname_fasta = "cbg_" + "".join(nodestringlist) + ".fa"

    if cbg.node_count() > 2:
        # validate projected_on BEFORE the fasta file is written; the file
        # cleanup is done by runcexpander, so raising after writing would
        # leave the fasta file behind
        if not projected_on:
            strongestN = cbg.strongest_connected_node()
            projected_on = cbg.organism_by_node(strongestN)
        elif projected_on != ":::" and \
        projected_on not in cbg.organism_set():
            raise OrganismNotPresentInGraph

        # write multi fasta of the requested region's sequences
        if cbgregion == 'maxsr':
            fastaseqs = cbg.getmaxsrproteinsequences()
        elif cbgregion == 'omsr2orfend':
            fastaseqs = cbg.getomsr2orfendproteinsequences()
            # append dummy W amino acid that mimics the
            # alignment of STOP codons
            for k, v in fastaseqs.items():
                fastaseqs[k] = v + "W"
        else:
            # omsr or non-existing keyword...
            fastaseqs = cbg.getomsrproteinsequences()
        writeMultiFasta(fastaseqs, fname_fasta)

        # get cxpdrOutput object; file-cleanup is taken care for
        cxpdrOutput = runcexpander(fname_fasta,
                                   cbalignp_commandline=" -y",
                                   output=output)

        # correct hard-added 'W' residue in cexpander OMSR2ORFEND
        if cbgregion == 'omsr2orfend':
            for pos, trf in enumerate(cxpdrOutput._transferblocks):
                # the dummy residue is the last position of each block
                trf.positions -= 1
                if trf.binarystring[-1] == "1":
                    trf.score -= 1
                trf.binarystring = trf.binarystring[0:-1]
                trf.ratio = trf._binarystring2matchratio(trf.binarystring)
                if pos == 0:
                    # (re)set the output on the corrected first block
                    cxpdrOutput.set_transferblock(trf.header)
            for k, seq in cxpdrOutput.sequences.items():
                cxpdrOutput.sequences[k] = seq[0:-1]
        # EOF correct hard-added 'W' residue in cexpander OMSR2ORFEND

    else:
        # weird case of CBG with only 2 nodes;
        # can happen when ``weakest organism`` is removed
        # from the GSG based on GTG analyses
        # Fake cexpander output here: all positions of the (first)
        # pacbporf's original alignment are set to "1"
        pacbporf = cbg.pacbps.values()[0]
        bstring = "1" * (pacbporf._original_alignment_pos_end -
                         pacbporf._original_alignment_pos_start)
        cxpdrOutput = CexpanderOutput()
        cxpdrOutput.binarystring = bstring
        cxpdrOutput.header = cbg.organism_by_node(cbg.get_ordered_nodes()[0])
        cxpdrOutput.positions = len(bstring)
        cxpdrOutput.score = len(bstring)

    # (2) return the output object
    return cxpdrOutput
Exemple #12
0
def blastanalysescbgjunction(gsg,prevCBG,nextCBG,
    omit_cbg_orfs = False,
    omit_non_cbg_orfs = False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org,orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf
        for org,orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,prevCBG,nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
            ).split("\n")
        )

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa)
    ############################################################

    # depending on the omit_non_cbg_orfs / omit_cbg_orfs flags, remove ORFs
    # that do not belong to prevCBG and nextCBG, ORFs that DO belong to
    # prevCBG and nextCBG, or neither
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org,orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org,orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
               del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):",len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return [] 

    # check if all organisms are still covered
    orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return [] 

    # create !single! fasta database
    fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa"
    writeMultiFasta(fastadbmfa,fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ,orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ+"_orf_"+str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname,
                    extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS,_orfSid = alignment.title.replace(">","").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS,int(_orfSid))
                orfS  = orfs[nodeS]
               
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return [] 

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]