Example 1
def exononorfs2pacbporf(exonQ, exonS, matrix=None):
    """
    Create a PacbPORF object from 2 ExonOnOrf objects

    @type  exonQ: ExonOnOrf object
    @param exonQ: ExonOnOrf object (query)

    @type  exonS: ExonOnOrf object
    @param exonS: ExonOnOrf object (sbjct)

    @attention: exonQ and exonS proteinsequence() MUST be identical in length!

    @rtype:  pacb.PacbPORF instance
    @return: pacb.PacbPORF instance
    """
    # prepare input data
    query = exonQ.proteinsequence()
    sbjct = exonS.proteinsequence()
    qaas = exonQ.orf.dnapos2aapos(exonQ.start)
    saas = exonS.orf.dnapos2aapos(exonS.start)
    # make pacbp and then pacbporf object
    if matrix:
        pacbpobj = pacb.PacbP(input=(query, sbjct, qaas, saas), MATRIX=matrix)
    else:
        pacbpobj = pacb.PacbP(input=(query, sbjct, qaas, saas))
    if pacbpobj.length == 0:
        return None
    pacbporfobj = pacbp2pacbporf(pacbpobj, exonQ.orf, exonS.orf)
    pacbporfobj.extend_pacbporf_after_stops()
    return pacbporfobj
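
A note on the coordinate step above: exononorfs2pacbporf feeds the exons' protein sequences plus their AA start offsets (via orf.dnapos2aapos()) into pacb.PacbP. Below is a minimal, self-contained sketch of that DNA-to-AA conversion; dnapos2aapos_sketch is a hypothetical stand-in, not the library method, and it assumes the Orf's first coding nucleotide as reference point.

def dnapos2aapos_sketch(dnapos, orf_dna_start):
    # every 3 nt downstream of the Orf's first coding nucleotide is one AA
    return (dnapos - orf_dna_start) / 3

# an exon starting 12 nt into its Orf starts at AA position 4
print dnapos2aapos_sketch(112, 100)    # prints 4
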
Example 2
def pacbp_from_clustalw(alignment=(), coords=()):
    """
    Create a PacbP object from a ClustalW alignment

    @type  alignment: tuple (of 3 strings)
    @param alignment: (query,match,sbjct) in !ALIGNED! format (identical string lengths, with gaps)

    @type  coords: tuple (of 4 integers)
    @param coords: (qaastart,qaaend,saastart,saaend) integers, AA-coords!

    @rtype:  pacb.PacbP instance
    @return: pacb.PacbP instance
    """
    query, match, sbjct = alignment
    qaastart, qaaend, saastart, saaend = coords
    # trim leading and trailing gaps in the alignment
    while query and query[0] == '-':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        saastart += 1
    while sbjct and sbjct[0] == '-':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        qaastart += 1
    while query and query[-1] == '-':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        saaend -= 1
    while sbjct and sbjct[-1] == '-':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        qaaend -= 1
    # trim non-aligned characters as well!
    while match and match[0] == ' ':
        query = query[1:]
        match = match[1:]
        sbjct = sbjct[1:]
        qaastart += 1
        saastart += 1
    while match and match[-1] == ' ':
        query = query[0:-1]
        match = match[0:-1]
        sbjct = sbjct[0:-1]
        qaaend -= 1
        saaend -= 1

    if query and sbjct:
        # make a PacbPORF of this alignment
        pacbpinput = (query, sbjct, qaastart, saastart)
        pacbp = pacb.PacbP(input=pacbpinput)
        pacbp.source = 'clustalw'
    else:
        # no proper clustalw alignable sequences -> no PacbP
        pacbp = None
    # return the created pacbp
    return pacbp
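
The core of pacbp_from_clustalw is the trimming loop: leading/trailing gaps and unaligned (' ') match columns are stripped while the AA start/end coordinates are adjusted. Below is a self-contained toy run of just that logic on made-up strings (not library code), showing how a leading query gap moves only the sbjct coordinate while an unaligned column moves both.

query, match, sbjct = "-MSTK", "  STK", "AQSTK"
qaastart, saastart = 10, 20
while query and query[0] == '-':
    # gap in the query: only the sbjct consumed a residue here
    query, match, sbjct = query[1:], match[1:], sbjct[1:]
    saastart += 1
while match and match[0] == ' ':
    # unaligned column: both sequences consumed a residue
    query, match, sbjct = query[1:], match[1:], sbjct[1:]
    qaastart += 1
    saastart += 1
print query, sbjct, qaastart, saastart    # STK STK 11 22
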
Example 3
def pacbporf2pacbp(pacbporf):
    """
    Convert PacbPORF object (backwards to) a PacbP object

    @type  pacbporf: pacb.PacbPORF instance
    @param pacbporf: pacb.PacbPORF instance 

    @rtype:  pacb.PacbP instance
    @return: pacb.PacbP instance 
    """
    if str(pacbporf.__class__) in ('pacb.PacbP', 'pacb.pacbp.PacbP'):
        # object is already a PacbP, not a PacbPORF
        return pacbporf
    else:
        # object is a PacbPORF or PacbPDNA; convert to PacbP
        staPos = pacbporf._get_original_alignment_pos_start()
        endPos = pacbporf._get_original_alignment_pos_end()
        query = pacbporf.query[staPos.position:endPos.position + 1]
        sbjct = pacbporf.sbjct[staPos.position:endPos.position + 1]
        pacbpinput = (query, sbjct, staPos.query_pos, staPos.sbjct_pos)
        pacbpobj = pacb.PacbP(input=pacbpinput, MATRIX=pacbporf.MATRIX)
        # update required attributes
        pacbpobj.source = pacbporf.source
        pacbpobj._IS_DELETE_PROTECTED = pacbporf._IS_DELETE_PROTECTED
        pacbpobj._IS_EDIT_PROTECTED = pacbporf._IS_EDIT_PROTECTED
        # return the object
        return pacbpobj
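
The conversion above slices the (stop-codon extended) PacbPORF alignment back to its original coordinates and reuses the stored query/sbjct AA positions. A toy illustration of that slicing follows; _Pos is a hypothetical stand-in for the alignment position objects, not the library class.

class _Pos(object):
    # hypothetical minimal stand-in for an alignment position object
    def __init__(self, position, query_pos, sbjct_pos):
        self.position = position
        self.query_pos = query_pos
        self.sbjct_pos = sbjct_pos

full_query = "xxMSTKEDyy"    # 'x'/'y' mark the extension beyond the original alignment
full_sbjct = "xxMSTREDyy"
staPos, endPos = _Pos(2, 50, 80), _Pos(7, 55, 85)
query = full_query[staPos.position:endPos.position + 1]
sbjct = full_sbjct[staPos.position:endPos.position + 1]
print query, sbjct, staPos.query_pos, staPos.sbjct_pos    # MSTKED MSTRED 50 80
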
Example 4
def get_frameshifted_cbg(cbg, input, verbose=True):
    """
    Get a CBG with frameshifts (in some of its Orfs) compared to this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to check for frameshifts

    @type  input: dict
    @param input: input <dict data structure> with lists of Orfs

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  CodingBlockGraph or None
    @return: CodingBlockGraph (when existing) or None
    """

    # get eligible lists of Orfs
    orfs = _get_elegiable_frameshift_orfsets(cbg, input)

    # check how many Orfs are eligible...
    if sum([len(l.orfs) for l in orfs.values()]) == cbg.node_count():
        # no frameshift possible here...
        return None

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        # REMAP fastaheaders as ids to retrieve the Orfs after blast..
        for orf in orfs[org].orfs:
            orf.fastaheader = str(orf.id)
        fname = "%s_frameshiftcbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [orf.id for orf in orfs[org].orfs],
            print [str(orf) for orf in orfs[org].orfs]
        ########################################################################

    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfid = alignment.title.replace(">", "").split(" ")[0].replace(
                    "_", "")
                orfS = orfs[orgS].get_orf_by_id(int(orfid))

                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                if nodeQ in cbg.get_nodes() and nodeS in cbg.get_nodes():
                    pacbporf = cbg.get_pacbps_by_nodes(node1=nodeQ,
                                                       node2=nodeS)[0]

                else:
                    # take only the *best* HSP (highest scoring first one)
                    hsp = alignment.hsps[0]

                    # correct to absolute positions
                    hsp.query_start = hsp.query_start + orfQ.protein_startPY
                    hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                    # initialize the PacbP
                    pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)
                    ############################################################
                    if verbose: print "NEW:", pacbporf
                    ############################################################

                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        ############################################################
        if verbose: print "org_set_size() PCG < CBG"
        ############################################################
        # no frameshifted CBG can be constructed
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(cbg.node_count())
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_graphlist_by_total_weight_and_identity()

    ############################################################################
    if verbose:
        print "FScbgs (%s)" % len(cbgList)
        for fscbg in cbgList:
            print fscbg
    ############################################################################

    if not cbgList:
        # no (better) frameshifted CBG
        return None
    elif cbgList and not cbgList[0].node_set().symmetric_difference(
            cbg.node_set()):
        # best CBG is not frameshifted, but CBG itself
        return None
    else:
        # score the difference between the frameshifted and current CBG
        score_cbg = cbg.total_weight() * cbg.omsr_identityscore()
        score_fscbg = cbgList[0].total_weight(
        ) * cbgList[0].omsr_identityscore()
        # check overlap between the frameshifted and current CBG
        a, b, c, d, e, f, g = relatively_positioned_towards(cbgList[0], cbg)

        ########################################################################
        if verbose:
            print "CBG", cbg
            cbg.printmultiplealignment()
            for fscbg in cbgList:
                print "fsCBG:", fscbg
                fscbg.printmultiplealignment()
        ########################################################################

        if (c, d) == ((0, 0, 1), (1, 0, 0)) or \
           (c, d) == ((1, 0, 0), (0, 0, 1)):
            # CBG and frameshifted CBG do not share a single AA overlap...
            # This does not represent a frameshifted CBG as we searched for
            return False
        elif score_fscbg > score_cbg:
            # return the highest scoring, frameshifted CBG
            return cbgList[0]
        else:
            # no, still not convinced that this is a frameshifted CBG
            return False
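
The final comparison above reduces each CBG to a single score: total edge weight scaled by the OMSR identity score, and the frameshifted candidate only wins if its score is higher. A hedged sketch with toy numbers (the attribute semantics are assumed from the code above):

def cbg_score_sketch(total_weight, omsr_identityscore):
    # combined score as used above: edge weight scaled by OMSR identity
    return total_weight * omsr_identityscore

score_cbg   = cbg_score_sketch(1200, 0.85)    # current CBG
score_fscbg = cbg_score_sketch(1150, 0.95)    # candidate frameshifted CBG
print score_fscbg > score_cbg                 # True -> the candidate would be returned
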
Example 5
def _blastorfset2blastdb(geneQ,
                         geneS,
                         blastdbfname,
                         input,
                         crossdata,
                         GSgraph,
                         blastoptions=None,
                         elegiable_orfsQ=[],
                         elegiable_orfsS_ids=[],
                         logging=False):
    """
    """
    hitcnt = 0
    for orfQ in elegiable_orfsQ:
        # check if protein sequence present in Orf object
        # in obscure cases of unigenes, no protein sequence present!
        if not orfQ.protein_sequence: continue

        # make unique node identifier and blast header
        nodeQ = (geneQ, orfQ.id)
        header = "%s_orf_%s" % (geneQ, orfQ.id)

        # do the blastp!
        blastrec = blastall_seq2db(
            header,
            orfQ.protein_sequence,
            dbname=blastdbfname,
            extra_blastp_params=blastoptions.extra_blastp_params)

        # check if blast failed (then, blastrec == False)
        if not blastrec: continue

        # check if there are any hits/hsps!
        if len(blastrec.alignments) == 0:
            # no hits; continue
            continue

        for alignment in blastrec.alignments:
            # get back the orf pointer from the SBJCT title and create nodeS
            _parts = alignment.title.split("_")
            geneS = "_".join(_parts[0:-2]).replace('>', '')
            _orfpointerS = int(_parts[-1])
            nodeS = (geneS, _orfpointerS)

            # ignore hit if the sbjct orf id does not occur in the (non-empty) list elegiable_orfsS_ids
            if elegiable_orfsS_ids and _orfpointerS not in elegiable_orfsS_ids:
                continue

            # get the Orf object of this sbjct sequence
            orfS = input[geneS]['orfs'].get_orf_by_id(_orfpointerS)

            # loop over the HSPs
            for hsp in alignment.hsps:

                # If hits are really tiny (happens in case of BLOSUM45 matrix),
                # discard them directly before precious time is lost...
                if len(hsp.query
                       ) <= blastoptions.BLASTP_DIRECTLY_IGNORE_TINY_HITS:
                    continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY
                hsp.query_end = hsp.query_end + orfQ.protein_startPY
                hsp.sbjct_end = hsp.sbjct_end + orfS.protein_startPY

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)

                if hsp.query.find(" ") > 0:
                    # VERY exceptional case: HSP erroneously parsed from NCBI output:
                    # Score 8 (7 bits), expectation 1.7e+01, alignment length 41
                    # Query:    1622 STHTYDAC                                                TRCI----PFVDTGHKHENPTEALLDSTA 1654
                    #                TR      P  +  H +  P+++   S++
                    # Sbjct:     489 TEHIYLHT                                                TRSTWPPKPPTNASHANTKPSKSHHRSSS 525
                    if len(hsp.query.split(" ")[-1]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[-1]
                        hsp.sbjct = hsp.sbjct.split(" ")[-1]
                    elif len(hsp.query.split(" ")[0]) == len(hsp.match):
                        hsp.query = hsp.query.split(" ")[0]
                        hsp.sbjct = hsp.sbjct.split(" ")[0]
                    elif len(hsp.query) == len(hsp.match):
                        # spaces in both query/match/sbjct
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            if hsp.sbjct[pos] == " " and hsp.match[pos] == " ":
                                hsp.query = hsp.query[0:pos] + hsp.query[pos +
                                                                         1:]
                                hsp.match = hsp.match[0:pos] + hsp.match[pos +
                                                                         1:]
                                hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos +
                                                                         1:]
                            else:
                                # HSP is not repairable -> quit trying
                                break
                    elif len(hsp.query) == len(hsp.sbjct):
                        while hsp.query.find(" ") > 0:
                            pos = hsp.query.find(" ")
                            hsp.query = hsp.query[0:pos] + hsp.query[pos + 1:]
                            hsp.sbjct = hsp.sbjct[0:pos] + hsp.sbjct[pos + 1:]
                        # recreating the alignment match string is done upon
                        # creation of the PacbP object

                    else:
                        pass

                # VERY exceptional case: HSP starts or ends with a gap
                # I expect this is an error in Blastp ....
                strip_exterior_gaps(hsp)
                try:
                    pacbp = pacb.PacbP(blastp_hsp=hsp,
                                       MATRIX=blastoptions.MATRIX)
                except:
                    # VERY exceptional miscellaneous cases: HSP erroneously parsed from NCBI output:
                    print hsp
                    print "'%s' X" % hsp.query, len(
                        hsp.query), hsp.query_start, hsp.query_end
                    print "'%s' X" % hsp.match, len(hsp.match)
                    print "'%s' X" % hsp.sbjct, len(
                        hsp.sbjct), hsp.sbjct_start, hsp.sbjct_end
                    pacbp = pacb.PacbP(blastp_hsp=hsp,
                                       MATRIX=blastoptions.MATRIX)

                # make pacbp of this hsp
                pacbp = pacb.PacbP(blastp_hsp=hsp, MATRIX=blastoptions.MATRIX)

                # if logging is requested, print this pacbp to STDOUT
                if logging:
                    print ">>> Q", nodeQ, orfQ.tcode_symbolic(
                    ), "S", nodeS, orfS.tcode_symbolic(
                    ), pacbp, blastoptions.MATRIX.name, hsp.expect, hsp.bits
                    print ">>>", blastoptions.extra_blastp_params
                    if pacbp.length > 100:
                        print pacbp.query[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.query[-40:]
                        print pacbp.match[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.match[-40:]
                        print pacbp.sbjct[0:40] + '.' * 7 + str(
                            pacbp.length - 80) + '.' * 7 + pacbp.sbjct[-40:]
                    else:
                        print pacbp.query
                        print pacbp.match
                        print pacbp.sbjct

                # blastoptions.BLASTP_HSP_MINIMAL_LENGTH represents the minimal
                # length of the aligned part. Too short pacbps are discarded
                if pacbp.length < blastoptions.BLASTP_HSP_MINIMAL_LENGTH:
                    if pacbp.identityscore == float(pacbp.length):
                        # escape for 100% identical tiny pacbps
                        pass
                    elif pacbp.identity + pacbp.similarity == pacbp.length:
                        # escape for 100% similar tiny pacbps
                        pass
                    else:
                        # pacbp is too small. Discard!
                        if logging: print "too small..."
                        continue

                # check if the pacbp is not conflicting with the current GSG graph
                # if so, ignore now because it will not yield a proper edge in an (accepted) CBG!
                if GSgraph and len(
                        GSgraph
                ) and GSgraph.is_pacbp_conflicting_with_genestructure(
                        pacbp, orgQ=geneQ, orgS=geneS):
                    ###print "GSGconflict!", nodeQ,nodeS, GSgraph.is_pacbp_conflicting_with_genestructure(pacbp,orgQ=geneQ,orgS=geneS), len(pacbp)
                    continue

                # here we have a potentially accepted pacbp.
                # make a/the unique key of this pacbp
                key = (pacbp.bits, pacbp.length, orfQ.id, _orfpointerS)

                # check for evalue criterion
                if (blastoptions.BLASTP_HSP_MAXIMAL_EXPECT
                        or blastoptions.BLASTP_HSP_MAXIMAL_EXPECT == 0.0
                    ) and hsp.expect > blastoptions.BLASTP_HSP_MAXIMAL_EXPECT:
                    # pacbp is long enough but has a too high evalue
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "too low bitscore or expect"
                    continue

                # check for bitscore criterion
                if (blastoptions.BLASTP_HSP_MINIMAL_BITS
                        or blastoptions.BLASTP_HSP_MINIMAL_BITS == 0
                    ) and pacbp.bits < blastoptions.BLASTP_HSP_MINIMAL_BITS:
                    # pacbp is long enough but has a too low bitscore
                    crossdata[(geneQ, geneS)]['lowscoring_pacbs'][key] = pacbp
                    if logging: print "too low bitscore or expect"
                    continue

                # !!Hurray!! an accepted pacbp. Store to crossdata
                # store it to the 'accepted_pacbs' dict of crossdata
                crossdata[(geneQ, geneS)]['accepted_pacbs'][key] = pacbp
                hitcnt += 1
                if logging: print "ACCEPTED"
                # done -> check next orf!

    # return the filled crossdata structure
    return crossdata, hitcnt
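
BLAST reports HSP coordinates relative to the Orf protein sequences that were submitted, which is why the loop above adds each Orf's protein_startPY offset. A self-contained toy version of that correction follows (the _Hsp class is a hypothetical stand-in, not the Biopython record):

class _Hsp(object):
    # hypothetical minimal stand-in for a parsed BLAST HSP
    pass

hsp = _Hsp()
hsp.query_start, hsp.query_end = 5, 40        # relative to the orfQ protein
hsp.sbjct_start, hsp.sbjct_end = 3, 38        # relative to the orfS protein
orfQ_protein_startPY, orfS_protein_startPY = 210, 355
hsp.query_start += orfQ_protein_startPY
hsp.query_end += orfQ_protein_startPY
hsp.sbjct_start += orfS_protein_startPY
hsp.sbjct_end += orfS_protein_startPY
print hsp.query_start, hsp.query_end    # 215 250
print hsp.sbjct_start, hsp.sbjct_end    # 358 393
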
Example 6
def find_intermediary_codingblockgraph_with_tinyexon(graphL,graphR,input={},similaritymatrix=None,min_bitscore_ratio=0.3):
    """
    """
    tinyexon_crossdata = {}
    tinyexons_seen = 0

    for org in graphL.organism_set():
        theOrfL = graphL.get_orfs_of_graph(organism=org)[0]
        theOrfR = graphR.get_orfs_of_graph(organism=org)[0]
        # continue if identical orfs
        # TODO: maybe check as well for spanning ranges?
        # TODO: in theory, a tinyorf can exist on this orf as well...
        if theOrfL.id == theOrfR.id: continue
        msrL = graphL.minimal_spanning_range(organism=org)
        msrR = graphR.minimal_spanning_range(organism=org)

        # get eligible donors on orfL and acceptors on orfR
        if org in graphL._splicedonorgraph.organism_set() and\
        org in graphR._spliceacceptorgraph.organism_set():
            eligable_donors    = graphL._splicedonorgraph.get_organism_objects(org)
            eligable_acceptors = graphR._spliceacceptorgraph.get_organism_objects(org)
            orflist            = input[org]['orfs'].orfs

            # search for tinyexons
            tinyexonlist = bridge_two_pacbporfs_by_tinyexon(theOrfL,theOrfR,
                    preceding_donor_sites= eligable_donors,
                    subsequent_acceptor_sites= eligable_acceptors,
                    orflist=orflist
                    )

            doubletinyexons = bridge_two_pacbporfs_by_two_tinyexons(theOrfL,theOrfR,
                    preceding_donor_sites= eligable_donors,
                    subsequent_acceptor_sites= eligable_acceptors,
                    orflist=orflist
                    )

        else:
            # no donors and acceptors on both orfs!
            return []


        # Order the tinyexons with respect to which orf they are located on.
        # for now, IGNORE tinyexons on both the left and right orf itself!!
        orf2tinyexons = {}
        for tinyexon in tinyexonlist:
            if tinyexon.orf.id in [ theOrfL.id, theOrfR.id ]:
                continue
            if orf2tinyexons.has_key(tinyexon.orf.id):
                orf2tinyexons[tinyexon.orf.id].append(tinyexon)
            else:
                orf2tinyexons[tinyexon.orf.id] = [ tinyexon ]

        # loop over the unique orfids on which tinyexons are predicted
        for orfid, telist in orf2tinyexons.iteritems():
            # loop over all other organisms (except the organism itself)
            for otherorg in graphL.organism_set():
                if otherorg == org: continue
                orgkey = [org,otherorg]
                orgkey.sort()
                _orgkey_reversed = False
                if orgkey != [org,otherorg]: _orgkey_reversed = True
                orgkey = tuple(orgkey)
                if not tinyexon_crossdata.has_key(orgkey):
                    tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
                orfL = graphL.get_orfs_of_graph(organism=otherorg)[0]
                orfR = graphR.get_orfs_of_graph(organism=otherorg)[0]

                # main list for all similarities on this orfid
                similaritiesL = []
                similaritiesR = []
                for tinyexon in telist:
                    # get protein query sequence from tinyorf
                    query_dna    = tinyexon.orf.inputgenomicsequence[tinyexon.acceptor.pos:tinyexon.donor.pos]
                    query        = dna2proteinbyframe(query_dna, (3 - tinyexon.acceptor.phase) % 3 )
                    query_aa_pos = tinyexon.acceptor.pos / 3

                    _similaritiesL = similaritymatrix.scansbjct(query,orfL.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    if orfL.id == orfR.id:
                        _similaritiesR = []
                    else:
                        _similaritiesR = similaritymatrix.scansbjct(query,orfR.protein_sequence,min_bitscore_ratio=min_bitscore_ratio)
                    # Append to all similarities on this orfid; append the tinyexon itself too
                    # in order to map the similarity back to a specific tinyexon.
                    # This is needed because there can be >1 tinyexon on the same orf...
                    _similaritiesL = [ (_data,tinyexon) for _data in _similaritiesL ]
                    _similaritiesR = [ (_data,tinyexon) for _data in _similaritiesR ]
                    similaritiesL.extend(_similaritiesL)
                    similaritiesR.extend(_similaritiesR)

                # re-order the similarities because they can contain data from 2 tinyexons (on the same orf)
                # ordering is performed on ``ratio * bitscore``
                # this - kind of - e-value calculation gives a preference for longer matches
                similaritiesL = _order_similarities(similaritiesL)
                similaritiesR = _order_similarities(similaritiesR)

                # Now make pacbporfs of only the BEST tinyexon and its
                # similarity on another organism
                TAKE_BEST_SIMILARITIES = 2
                for ( ( ratio, sbjct_pos, q_seq, match, s_seq, bitscore), tinyexon ) in similaritiesL[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos+orfL.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        ###print s_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), orfL.id, tinyexon.orf.id )
                        pacbp    = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,orfL,tinyexon.orf)
                    else:
                        ###print q_seq, "'%s'" % match, ratio, orfL.id
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfL.id )
                        pacbp    = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,orfL)

                    tinyexons_seen+=1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf

                for ( ( ratio, sbjct_pos, q_seq, match, s_seq, bitscore), tinyexon ) in similaritiesR[0:TAKE_BEST_SIMILARITIES]:
                    sbjct_aa_pos = sbjct_pos+orfR.protein_startPY
                    query_aa_pos = tinyexon.acceptor.pos / 3
                    if _orgkey_reversed:
                        pacbpkey = (bitscore, len(query), orfR.id, tinyexon.orf.id )
                        pacbp    = pacb.PacbP(input=(s_seq,q_seq,sbjct_aa_pos,query_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,orfR,tinyexon.orf)
                    else:
                        pacbpkey = (bitscore, len(query), tinyexon.orf.id, orfR.id )
                        pacbp    = pacb.PacbP(input=(q_seq,s_seq,query_aa_pos,sbjct_aa_pos))
                        pacbporf = pacb.conversion.pacbp2pacbporf(pacbp,tinyexon.orf,orfR)

                    tinyexons_seen+=1
                    pacbporf.extend_pacbporf_after_stops()
                    tinyexon_crossdata[orgkey]['accepted_pacbs'][pacbpkey] = pacbporf


    if not tinyexons_seen:
        return []
    else:
        # add the nodes/edges from the input graphs as well
        for ( (a,b,c,d),n1,n2 ) in graphL.pacbps.keys():
            orgkey   = ( n1[0], n2[0] )
            pacbpdna = graphL.pacbps[( (a,b,c,d),n1,n2 )]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a,b,c,d)] = pacbpdna

        for ( (a,b,c,d),n1,n2 ) in graphR.pacbps.keys():
            orgkey   = ( n1[0], n2[0] )
            pacbpdna = graphR.pacbps[( (a,b,c,d),n1,n2 )]
            if not tinyexon_crossdata.has_key(orgkey):
                tinyexon_crossdata[orgkey] = {'accepted_pacbs': {} }
            tinyexon_crossdata[orgkey]['accepted_pacbs'][(a,b,c,d)] = pacbpdna
    

    # make graph, remove low-connectivity nodes and split into complete graphs
    tinyexonsg = create_pacbpcollectiongraph_from_crossdata(tinyexon_crossdata)
    tinyexonsg.remove_low_connectivity_nodes(min_connectivity=2)
    splitted_tinyexongraphs = tinyexonsg.find_fully_connected_subgraphs(
                edges=4,
                max_missing_edges=0 )

    # now remove the graphs that are graphL and graphR ;-)
    graphLnodes = graphL.get_nodes()
    graphLnodes.sort()
    graphRnodes = graphR.get_nodes()
    graphRnodes.sort()
    for pos in range(0,len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphLnodes:
            splitted_tinyexongraphs.pop(pos)
            break
    for pos in range(0,len(splitted_tinyexongraphs)):
        tegNodes = splitted_tinyexongraphs[pos].get_nodes()
        tegNodes.sort()
        if tegNodes == graphRnodes:
            splitted_tinyexongraphs.pop(pos)
            break

    # make ListOfCodingBlockGraphs
    cbgList = ListOfCodingBlockGraphs(splitted_tinyexongraphs,
            input=input,
            crossdata=tinyexon_crossdata
            )

    # do all that is needed to create K(s) CBGs of these
    cbgList.harvest_pacbps_from_crossdata()
    cbgList.split_codingblock_on_alternatives_in_pacbps_dict(
            filter_for_msr=True,
            filter_for_omsr=True,
            )
    # remove non-compatible CBGs
    cbgList.remove_incompatible_cbgs(
            minimal_node_count=len(input),
            minimal_edge_count=len(tinyexon_crossdata),
            filter_for_msr=True,
            filter_for_omsr=True
            )

    # get list of accepted TinyExonCbgs 
    accepted_tegs = cbgList.codingblockgraphs

    # and update weights by minimal spanning region
    for teg in accepted_tegs: teg.update_edge_weights_by_minimal_spanning_range()

    # and check if they can be placed IN BETWEEN graphL and graphR
    # TODO some prints
    final_graphs_with_tinyexons = []
    for teg in accepted_tegs:

        test_codingblock_order, rejected_graphs = make_consensus_genestructure_from_compatible_pacb_graphs(
                [graphL,graphR,teg],None)

        print "checking hypo TEG:", teg.get_ordered_nodes(), "of", len(accepted_tegs), "len of join", len(test_codingblock_order)


        #empty_input = {}
        #for org in teg.organism_set(): empty_input[org] = None
        #tmpGSG = GenestructureOfCodingBlockGraphs(empty_input)
        #tmpGSG.add_codingblocks([graphL,graphR,teg])
        #print "tinyexon tmp check:", len(test_codingblock_order), len(tmpGSG), teg.get_ordered_nodes()


        wt_after  = teg.total_weight()
        if len(test_codingblock_order) == 3:
            teg_nodes = teg.get_nodes()
            teg_nodes.sort()
            middle = test_codingblock_order[1]
            middle_nodes = middle.get_nodes()
            middle_nodes.sort()
            if middle_nodes == teg_nodes:
                # yahoo, this one is 100% okay!
                final_graphs_with_tinyexons.append( teg )

    if len(final_graphs_with_tinyexons)==1:
        print final_graphs_with_tinyexons[0].get_nodes()
        return final_graphs_with_tinyexons
    elif len(final_graphs_with_tinyexons)>1:
        print "### WARNING!!!! more than 1 tinyexon graph is found."
        print "### WARNING!!!! however, only single one is returned."
        print "### WARNING!!!! returning >1 can cause errors..."
        return [ final_graphs_with_tinyexons[0] ]
    else:
        return []
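
The crossdata bookkeeping above keys each organism pair on the alphabetically sorted tuple and remembers whether sorting flipped the (query, sbjct) orientation, so the PacbP input can be built the right way around. A minimal sketch of that bookkeeping with toy organism names:

org, otherorg = 'orgB', 'orgA'
orgkey = [org, otherorg]
orgkey.sort()
_orgkey_reversed = (orgkey != [org, otherorg])
orgkey = tuple(orgkey)
print orgkey, _orgkey_reversed    # ('orgA', 'orgB') True
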
Example 7
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # depending on the omit_cbg_orfs / omit_non_cbg_orfs settings, remove
    # Orfs that do or do not belong to prevCBG and nextCBG
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
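
Both the database filtering and the hit handling above rely on the header convention "<organism>_orf_<orfid>" and recover the graph node by splitting on "_orf_". A toy illustration with a hypothetical header value:

header = "fungusA_orf_17"    # hypothetical entry name
org, orfid = header.split("_orf_")
node = (org, int(orfid))
print node    # ('fungusA', 17)
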
Example 8
def get_reverse_cbg(cbg, frame, verbose=False):
    """
    Get the ReversecomplementCodingBlockGraph in requested frame of this CBG

    @type  cbg: CodingBlockGraph
    @param cbg: CodingBlockGraph to reversecomplement

    @type  frame: integer
    @param frame: 0,1 or 2

    @type  verbose: Boolean
    @param verbose: print intermediate info to STDOUT for debugging purposes

    @rtype:  ReversecomplementCodingBlockGraph or None
    @return: ReversecomplementCodingBlockGraph (when existing) or None
    """
    min_orf_length = (cbg.omsrlength() / 2) * 3
    orfs = get_reverse_strand_orfsets(cbg,
                                      frame,
                                      min_orf_length=min_orf_length)

    # remap the identifiers of the orf objects i.o.t....
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    for org in orfs.keys():
        fname = "%s_reversecbg_%s.mfa" % (org, cbg.barcode())
        writeMultiFasta(orfs[org].tofastadict(), fname)
        multifastas[org] = fname
        ########################################################################
        if verbose:
            print "ORFS:", org, len(orfs[org].orfs),
            print [len(o.protein_sequence) for o in orfs[org].orfs]
        ########################################################################

    revpacbps = {}
    for orgQ, orgS in cbg.pairwisecrosscombinations_organism():
        # create blastdb if it does not exist yet
        if not blastdbs.has_key(orgS):
            formatdb(fname=multifastas[orgS])
            blastdbs[orgS] = multifastas[orgS]

        revpacbporfs = {}
        for orfQ in orfs[orgQ].orfs:
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       orfQ.protein_sequence,
                                       dbname="./" + blastdbs[orgS])
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # obtain coordinates from sbjct orf identifier
                orfS = orfs[orgS].get_orf_by_id(
                    alignment.title.replace(">", ""))
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]
                # skip if hsp is very short
                if len(hsp.query) < cbg.omsrlength() / 2: continue

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                    ###pacbporf.print_protein_and_dna()
                ################################################################

                nodeQ = (orgQ, orfQ.protein_startPY)
                nodeS = (orgS, orfS.protein_startPY)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    if not pacbpcol.organism_set_size() == cbg.organism_set_size():
        # no CBG on the reverse strand
        return None

    # ``deepcopy`` PacbPcollection
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ############################################################################
    if verbose:
        print pacbpcol, "bitscores:",
        print [pacbporf.bitscore for pacbporf in dpcpacbpcol.pacbps.values()]
    ############################################################################

    # do some transformations on the pacbpcol
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=cbg.node_count() -
                                           1)
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=cbg.node_count() - 1, max_missing_edges=0)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_complete_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    ############################################################################
    if verbose:
        for revcbg in cbgList:
            print "revCBG:", revcbg
    ############################################################################

    if not cbgList:
        # no CBG on the reverse strand
        return None
    else:
        # return the highest scoring CBG as a ReversecomplementCodingBlockGraph
        return CodingBlockGraph2ReversecomlementCodingBlockGraph(
            cbgList.codingblockgraphs[0])
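
Finally, the reverse-strand Orf harvesting at the top of get_reverse_cbg only keeps Orfs spanning at least half of the CBG's OMSR, with the AA length converted back to nucleotides. A sketch with a toy OMSR length (Python 2 integer division, as in the original code):

omsrlength = 75                          # OMSR length in amino acids
min_orf_length = (omsrlength / 2) * 3    # 37 AA -> 111 nt
print min_orf_length                     # prints 111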