Code example #1
File: lib_cexpander.py Project: IanReid/ABFGP
import re
from sets import Set   # Python 2 'sets' module (this codebase is Python 2)
# StopWatch is an ABFGP helper class, imported elsewhere in the original module

def cexpander2multiplealignment(cxpdr,verbose=False):
    """
    This function and its application are still under development. In a
    future version, the data obtained from cexpander will replace the
    (deprecated) PAOC and PASC VISTA-like tracks, which were far too
    computationally expensive to obtain.
    """
    ########################################################################
    if verbose:
        stw = StopWatch(name="cxpdr2multiplealignment")
        stw.start()
    ########################################################################


    # for each of the _transferblocks (1 for each organism/gene), the
    # binarystring **should** contain an identical number of 1's.
    # In freak-accident cases (one in hundreds of thousands of cases),
    # it is observed that this is not the case. Catch this exception here
    # before it hard-crashes with a raise somewhere later in this function
    if len(Set([ trf.binarystring.count("1") for trf in cxpdr._transferblocks ])) > 1:
        print "WARNING: unequal Cexpander.transferblocks.binarystring 1's count:",
        print Set([ trf.binarystring.count("1") for trf in cxpdr._transferblocks ]) 
        return False

    # split the cexpander binarystrings on character changes 0->1 and 1->0
    substrings = {}
    orgs   = [ trf.header for trf in cxpdr._transferblocks ]
    for ipos in range(0,len(orgs)):
        org = orgs[ipos]
        trf = cxpdr._transferblocks[ipos]
        substrings[org] = [ x.group() for x in re.finditer("(1+|0+)",trf.binarystring) ]

    # maximum number of blocks in the cexpander output
    # WARNING TODO THIS IS STILL NOT 100% SAFE!!
    try:
        maxblocks = max(Set([ len(substrings[org]) for org in substrings.keys() ]))
    except:
        print "ERROR in cexpander2multiplealignment"
        print substrings.keys()
        print "inputseqs:", len(cxpdr.sequences)
        for k,v in substrings.iteritems():
            print k, len(v)
            print v
        # now raise the error...
        maxblocks = max(Set([ len(substrings[org]) for org in substrings.keys() ]))

    curblock = 0
    ########################################################################
    if verbose:
        print "maxblocks:", maxblocks,
        print [ len(substrings[org]) for org in substrings.keys() ]
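
As a hedged aside (a standalone sketch, not part of the ABFGP source): the two tricks used above, checking that all binarystrings carry the same number of 1's and splitting a binarystring into maximal runs of 1's and 0's, can be reproduced in isolation:

    import re
    from sets import Set  # Python 2; the builtin set() works equally well

    binarystrings = ["1110011", "1101101"]

    # uniformity check: a Set of the 1-counts has size 1 when all are equal
    print len(Set([ bs.count("1") for bs in binarystrings ])) == 1   # True

    # split into maximal runs of identical characters
    print [ m.group() for m in re.finditer("(1+|0+)", binarystrings[0]) ]
    # -> ['111', '00', '11']
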
Code example #2
def cbg_cexpander_inframe_intron_search(self,
        min_total_pssm_score = MIN_TOTAL_PSSM_INFRAME_INTRON,
        min_intron_nt_length = MIN_INTRON_NT_LENGTH,
        verbose=False):
        """
        @type  self: CodingBlockGraph
        @param self: CodingBlockGraph instance

        @type  min_total_pssm_score: float
        @param min_total_pssm_score: MIN_TOTAL_PSSM_INFRAME_INTRON

        @type  min_intron_nt_length: integer
        @param min_intron_nt_length: MIN_INTRON_NT_LENGTH

        @type  verbose: Boolean
        @param verbose: print status/debugging messages to STDOUT

        @rtype:  list or False
        @return: list with new (sub)CBGs or False when not split
        """
        ########################################################################
        if verbose:
            stw = StopWatch(name="cexpCbgIfIntron")
            stw.start()
        ########################################################################

        # return variable; list of split CBGs.
        return_cbg_list = [ self ]

        # create cexpander multiplealignment blocks
        cbgMA = lib_cexpander.cexpander2multiplealignment(self._cexpander,
                verbose=verbose)

        # In freak-accident cases (one in thousands of times), cexpander produces
        # unequal amounts of 1's in the binarystrings. This is theoretically impossible.
        # The problem is being worked on; in the meantime, cexpander2multiplealignment
        # returns False in these cases. Catch this here by quitting the current
        # cbg_cexpander_inframe_intron_search() function call and returning False
        TODO=True
        if not cbgMA: return False

        ########################################################################
        if verbose:
            print stw.lap()
            blockscnt = len( cbgMA[ cbgMA.keys()[0] ] )
            print self
            print "BLOCKS:", blockscnt, self._cexpander.binarystring,
            print self._cexpander.projected_on
            for org in cbgMA.keys():
                print org, "\t", 
                for blockid in range(0,blockscnt):
                    if cbgMA[org][blockid].count("1") >= 1:
                        print len(cbgMA[org][blockid]), 
                    else:
                        print cbgMA[org][blockid], 
                print ""
        ########################################################################

        # loop over the aligned cexpander blocks and check the 
        # non-uniformly aligned blocks for length variation
        blockscnt  = len( cbgMA[ cbgMA.keys()[0] ] )
        oricbgomsr = self.overall_minimal_spanning_range()

        for blockid in range(0,blockscnt):
            # obtain non-uniformly aligned AA lengths for this block
            lengths = {}
            for org in cbgMA.keys():
                lengths[org] = cbgMA[org][blockid].count("0")
            # skip the uniformly aligned blocks
            if list(Set(lengths.values())) == [0]: continue
            ####################################################################
            if verbose: print stw.lap(), "lengths:", lengths
            ####################################################################

            # obtain coordinates for this area
            lsrcoords = {}
            for org in cbgMA.keys():
                node = self.node_by_organism(org)
                coordSta = min(oricbgomsr[node])
                # sum the lengths of the preceding (non)aligned blocks
                for i in range(0,blockid):
                    coordSta += cbgMA[org][i].count("1") +\
                                cbgMA[org][i].count("0")
                # end coord is start coord + length of current block
                coordEnd = coordSta + lengths[org]
                lsrcoords[org] = ( coordSta, coordEnd )
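            # hedged illustration (not in the original source): with blocks
            # ['111','00','11'] and blockid == 1, coordSta becomes
            # min(oricbgomsr[node]) + 3 (length of block 0) and
            # coordEnd = coordSta + 2 (the two 0's of the current block)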

            ####################################################################
            if verbose: print stw.lap(), "lsrcoords:", lsrcoords
            ####################################################################

            # translate AA lengths to NT lengths
            for k in lengths.keys(): lengths[k] = lengths[k]*3

            # check length discrepancy and assign putative inframe introns
            putative_inframe_intron_orgs =\
                _length_discrepancy_to_potential_inframe_introns(lengths)

            if not putative_inframe_intron_orgs:
                # no length discrepancy that can represent an inframe intron
                continue

            # organisms/genes for which an inframe intron can be an improvement
            # data dictionary. Keys: 'max_nt_length', 'min_nt_length', 
            # 'min_donor_pos', 'max_acceptor_pos', 'min_total_pssm'
            inframe_intron_criteria = {}

            # find putative inframe introns in assigned genes/organisms
            putative_inframe_introns = {}
            for org in putative_inframe_intron_orgs:
                # assign inframe intron criteria for this organism
                inframe_intron_criteria[org] = {
                    'min_nt_length'     : min_intron_nt_length,
                    'min_total_pssm'    : min_total_pssm_score,
                    'min_donor_pos'     : (min(lsrcoords[org]) - 5) * 3,
                    'max_acceptor_pos'  : (max(lsrcoords[org]) + 5) * 3,
                    }

                # search for potential introns that can be responsible for this event
                theorf = self.get_orfs_of_graph(organism=org)[0]
                introns = pacb.connecting.merge_orfs_with_intron( theorf,theorf,
                            min_intron_nt_length=min_intron_nt_length
                            )

                ################################################################
                if verbose: print "introns:", org, len(introns), "raw"
                ################################################################

                # filter out introns outside the OMSR, too short, too long,
                # failing total pssm_score, etc.
                introns = _filter_putative_inframe_intron_list(
                        introns,org,inframe_intron_criteria)
                putative_inframe_introns[org] = introns
                ################################################################
                if verbose: print "introns:", org, len(introns), "filtered"
                ################################################################

            # check if all putative_inframe_intron_orgs have indeed introns
            # and check if all have at least a single intron phase in common
            if 0 in [ len(ill) for ill in putative_inframe_introns.values() ]:
                # no introns in one or more organisms/genes -> continue
                continue
            if len( putative_inframe_introns )> 1:
                # do phase check in all organisms/genes
                phases = Set([0,1,2])
                for org, intronlist in putative_inframe_introns.iteritems():
                    thisphases = Set([ intron.phase for intron in intronlist ])
                    phases.intersection_update(thisphases)
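                # hedged illustration: intron phase sets Set([0,2]) and
                # Set([1,2]) leave phases == Set([2]); disjoint sets leave
                # phases empty and the candidate split is abandoned below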
                if len(phases) == 0:
                    ################################################################
                    if verbose: print "no mutual phase -> no cbgIF.is_optimal()"
                    ################################################################
                    # no mutual phase -> no cbgIF.is_optimal() possible later on
                    continue
            else:
                pass

            # if an intron in at least a single organism is still there,
            # then split the involved pacbps in the `original` cbgL (the last
            # added CBG element in the return_cbg_list) and make a (virtual)
            # deepcopy of a novel cbgL. Both CBGs actually share the SAME pacbps!
            cbgR = self.deepcopy()
            cbgL = self.deepcopy()

            # loop over the organisms/genes with inframe introns and split
            # the Pacbps of these orgs in both to-become L and R CBGs
            inframe_intron_orgs = putative_inframe_introns.keys()
            for org in inframe_intron_orgs:
                ################################################################
                if verbose:
                    print "splitting PACBPs for org:", org
                    print "L", cbgL
                    print "R", cbgL
                ################################################################
                node = self.node_by_organism(org)
                replacementsL = {}
                replacementsR = {}
                for (key,node1,node2), pacbporf in cbgL.pacbps.iteritems():
                    if node in [node1,node2]:
                        # get the pacbp of this pacbporf and split it!
                        pacbp = pacb.conversion.pacbporf2pacbp(pacbporf)
                        org1 = self.organism_by_node(node1)
                        org2 = self.organism_by_node(node2)

                        if org1 in putative_inframe_introns.keys() and\
                        org2 in putative_inframe_introns.keys() and\
                        inframe_intron_orgs.index(org) > 0:
                            # already split; both orgs have inframe introns!
                            continue

                        # make split coordinates relative
                        splitL = lsrcoords[org1][0] - pacbp.query_start
                        splitR = lsrcoords[org1][1] - pacbp.query_start

                        pacbpL = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitL,splitL),returnside='left')
                        pacbpR = pacb.splitting.split_pacb_on_coordinates(
                            pacbp,(splitR,splitR),returnside='rigth')

                        # check if both cbgL and cbgR make sense
                        # if not -> return False!
                        if not pacbpL: return False
                        if not pacbpR: return False

                        ########################################################
                        if verbose:
                            print "#", node1, node2, lsrcoords[org1], 
                            print "L:", splitL, "R:", splitR
                            print pacbp
                            print pacbpL
                            print pacbpR
                        ########################################################

                        # pacbpL -> extended pacbporfL -> store to replacementsL
                        newpacbporfL = pacb.conversion.pacbp2pacbporf(pacbpL,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfL.extend_pacbporf_after_stops()
                        replacementsL[(key,node1,node2)] = newpacbporfL

                        # pacbpR -> extended pacbporfR -> store to replacementsR
                        newpacbporfR = pacb.conversion.pacbp2pacbporf(pacbpR,
                                       pacbporf.orfQ,pacbporf.orfS)
                        newpacbporfR.extend_pacbporf_after_stops()
                        replacementsR[(key,node1,node2)] = newpacbporfR


                # do the pacbporf replacements in both CBGs
                statusL = _update_cbg_with_pacbporf_replacements(
                            cbgL,replacementsL)
                statusR = _update_cbg_with_pacbporf_replacements(
                            cbgR,replacementsR)

                # check if both cbgL and cbgR make sense
                if not statusL or not statusR:
                    # return unchanged cbg status -> False
                    return False
                    


            # Verify the interface between cbgL and cbgR.
            # Most likely, the sites are nicely alignable.
            cbgIF = CodingBlockGraphInterface(cbgL,cbgR)
            cbgIF.force_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.allow_intron_in_organisms( putative_inframe_introns.keys() )
            cbgIF.harvest_splice_sites()
            cbgIF.find_conserved_splice_sites()

            ####################################################################
            if verbose:
                print cbgL
                print cbgIF
                print cbgR
                cbgIF.interfaceproperties()
            ####################################################################
            # check the properties of the CBGinterface
            if cbgIF.optimalitycheck().count(True) >= 2:
                # yes; is_compatible and donor and/or acceptor is optimal
                cbgL._CBGinterface3p = cbgIF
                cbgR._CBGinterface5p = cbgIF
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, cbgR ]
                ################################################################
                if verbose: print "INFRAME INTRON CONFIRMED!!"
                ################################################################
            else:
                # no compatible interface... although intron(s) was/were found!
                # (at least) two options are now open:
                # 1. enforce the intron(s) and create cbgIF with _forced_ends
                # 2. ignore the intron(s) and create an intermediate lsrCBG

                # 1. is `tricky`. First, how sure is this inframe intron,
                # and what type of criteria do we assume, etc.
                # Second, how to create a correct cbgIF? It must be an
                # IS_SPLITTED interface, of which the boundaries might fall
                # outside the OMSR's of the CBGs.

                # 2. ignore the intron(s) and create an intermediate lsrCBG
                lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(cbgL,lsrCBG)
                prepare_lsrcbg_and_cbg_for_gsg_insertion(lsrCBG,cbgR)
                cbgL.copy_5pcbginterface_from_othercbg(self)
                cbgR.copy_3pcbginterface_from_othercbg(self)
                return_cbg_list = [ cbgL, lsrCBG, cbgR ]
                ################################################################
                if verbose:
                    print "no INFRAME INTRON -> lsrCBG"
                    print cbgL
                    print " ", lsrCBG._CBGinterface5p
                    print " ", lsrCBG
                    print " ", lsrCBG._CBGinterface3p
                    print cbgR
                    self.printmultiplealignment()
                    print cbgL
                    cbgL.printmultiplealignment()
                    print cbgR
                    cbgR.printmultiplealignment()
                ################################################################

        # EOF this function.
        # return False if this CBG remained intact, a list of splits when split
        if len(return_cbg_list) == 1:
            return False
        else:
            return return_cbg_list
Code example #3
    def search_for_lowsimilarity_regions(self,aligned_intron_min_aa_length=ALIGNED_INTRON_MIN_AA_LENGTH,verbose=False):
        """
        Search CBGs in genestructure for lowsimilarity regions
        """

        ################################################################
        if verbose:
            stw = StopWatch(name='lsrCBGsearch')
            stw.start()
        ################################################################

        # Loop reversed through the genestructure to make sure that once
        # a CBG is split, the positions of the remainder of the
        # list stay intact.
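        # hedged illustration: replacing position i of a list with several
        # elements shifts every index after i, so iterating from the end
        # keeps the not-yet-visited (lower) positions valid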
        for posinGSG in range(len(self)-1,-1,-1):
            sg = self.codingblockgraphs[posinGSG]
            # skip IGNORED, lsrCBGs and CBGs that are incomplete (still awaiting HMM completion)
            if sg.IS_IGNORED: continue
            if sg.__class__.__name__ == 'LowSimilarityRegionCodingBlockGraph': continue
            if sg.node_count() < self.EXACT_SG_NODE_COUNT: continue

            if verbose: print stw.lap(), posinGSG, "start"

            # check for potential aligned intron
            if sg.potentially_contains_aligned_intron(window_aa_size=aligned_intron_min_aa_length):
                ########################################################
                if verbose:
                    print stw.lap(), posinGSG, "found"
                    for k,v in sg.getomsrproteinsequences().iteritems():
                        print ">%s\n%s\n" % (k,v)
                    print "ABOUT TO SPLIT:", sg
                    print sg._cexpander.binarystring,
                    print sg._cexpander.projected_on
                    sg.printmultiplealignment()
                    for k,pacbp in sg.pacbps.iteritems(): print k, pacbp
                ########################################################
                # now actually split by inframe intron
                res = sg.split_codingblock_by_inframe_intron()
                if len(res) == 1:
                    # no inframe intron found here
                    pass
                else:
                    # prepare the CBGs for insertion 
                    for pos in range(0,len(res)):
                        splittedCBG = res[pos]
                        splittedCBG.extend_pacbporfs(self.input)
                        splittedCBG.update_edge_weights_by_minimal_spanning_range()
                        splittedCBG.IS_SPLITTED = True
                        if pos > 0:
                            splittedCBG.IS_5P_SPLITTED = True
                            splittedCBG.IS_FIRST = False
                        if pos < len(res)-1:
                            splittedCBG.IS_3P_SPLITTED = True
                            splittedCBG.IS_LAST = False
                        # (re)create the cache for the split CBGs
                        splittedCBG.create_cache()
                        ################################################
                        if verbose:
                            print stw.lap(), posinGSG, "done!"
                            print "SUCCESFULLY SPLITTED:", splittedCBG
                            splittedCBG.printmultiplealignment()
                            print splittedCBG._cexpander.binarystring, 
                            print splittedCBG._cexpander.projected_on
                            print splittedCBG._omsr
                            for trf in splittedCBG._cexpander._transferblocks:
                                print trf.binarystring, trf.projected_on
                            for k,v in splittedCBG._cexpander.inputsequences.iteritems():
                                print v,"\t",k
                            for _org,orflist in splittedCBG.get_orfs_of_graph().iteritems():
                                print orflist[0], _org
                            for pacbp in splittedCBG.pacbps.values():
                                print pacbp
                                pacbp.print_protein(_linesize=100)
                        ################################################

                    # create lsrCBGs and cbgIFs between them by looping in reversed
                    # order over all pairs of CBGs (because lsrCBGs are inserted into the list)
                    for pos in range(len(res)-2,-1,-1):
                        cbgL,cbgR = res[pos:pos+2]
                        lsrCBG = create_intermediate_lowsimilarity_region(cbgL,cbgR)
                        res.insert(pos+1,lsrCBG)
                        # create cbgIF between the CBGs and the lsrCBG
                        # just create -> a cbgIF with an lsrCBG is immediately is_optimal()
                        cbgIFa = CodingBlockGraphInterface(cbgL,lsrCBG)
                        cbgIFb = CodingBlockGraphInterface(lsrCBG,cbgR)
                        # set cbgIF objects to the CBGs and the lsrCBG
                        cbgL._CBGinterface3p   = cbgIFa
                        lsrCBG._CBGinterface5p = cbgIFa
                        lsrCBG._CBGinterface3p = cbgIFb
                        cbgR._CBGinterface5p   = cbgIFb

                    # update the first and last CBG in this list with the
                    # cbgIFs of the parental CBG (variable sg)
                    res[0]._CBGinterface5p =  sg._CBGinterface5p
                    res[-1]._CBGinterface3p = sg._CBGinterface3p
                    # update the original IS_FIRST/IS_LAST status
                    res[0].IS_FIRST = sg.IS_FIRST
                    res[-1].IS_LAST = sg.IS_LAST

                    # and set the split CBGs into the genestructure
                    # by replacing the existing CBG (variable sg) at
                    # position posinGSG with the list of split CBGs
                    self.codingblockgraphs[posinGSG:posinGSG+1] = res

            else:
                # nope, no potential inframe intron; leave this CBG untouched
                ###print sg.total_weight(), False
                pass
Code example #4
File: abgpdbwarehouseminer.py Project: IanReid/ABFGP
    def mine(self, identifier, verbose=None):
        """ """
        # (re)set mined results to empty
        self._data = []
        self._loci = []

        # start timer
        stw = StopWatch("dbwarehouseMiner.mine('%s')" % identifier)
        if verbose: print stw.start()

        # find the current identifier in the warehouse
        identifier = identifier.replace("'", "").replace('"', '').strip()
        if not identifier: return False
        genomedir = self.identifier2genomedir(identifier)
        if not genomedir: return False

        # append the main/central locusdir to the loci
        locusdir = self.identifier2locusdir(identifier, genomedir=genomedir)
        if not locusdir: return False
        self._loci.append(locusdir)

        if verbose: print stw.lap(), "main locus identified"

        # now mine in the warehouse
        if self.SEARCH_METHOD != 'SIMILARITY':
            # set some column restraints as VERY strict (&&) instead of loose (||)
            column_restrain = "&&"
        else:
            column_restrain = "||"

        ####genomedirtag   = os.path.basename(os.path.split(genomedir)[0])
        genomedirtag = os.path.basename(genomedir)
        blastarchpatAB = os.path.join(
            self.dbwarehouse_path, "_crossblastp",
            "blast.%s_x_*.symmetrized" % (genomedirtag))
        blastarchpatBA = os.path.join(
            self.dbwarehouse_path, "_crossblastp",
            "blast.*_x_%s.symmetrized" % (genomedirtag))

        basecommand = """ awk -F':' '{ print $1"\\t"$2 }' | awk """ +\
                """ '{ if (($5>=%1.3f %s $6>=%1.3f) && ($7>=%1.3f %s $8>=%1.3f) && """ % (
                    self.MINIMAL_OVERLAP_RATIO,
                    column_restrain,
                    self.MINIMAL_OVERLAP_RATIO,
                    self.MINIMAL_BITSCORE_RATIO,
                    column_restrain,
                    self.MINIMAL_BITSCORE_RATIO,
                    ) +\
                """ (($5/$6)<=%1.2f %s ($6/$5)<=%1.2f)) { print $0"\t"(($5+$6)*$4)/2 } }' """ % (
                    self.MAXIMAL_LENGTH_RATIO,
                    column_restrain,
                    self.MAXIMAL_LENGTH_RATIO,
                    ) +\
                """ | sort -gr -k 9 """

        # commands with grep and zgrep for *.symmetrized and *.symmetrized.gz files
        command_grep = """grep "%s" %s %s | sort -u | %s""" % (
            identifier, blastarchpatAB, blastarchpatBA, basecommand)
        command_zgrep = """zgrep "%s" %s %s | sort -u | %s""" % (
            identifier, blastarchpatAB + ".gz", blastarchpatBA + ".gz",
            basecommand)

        # run the grep command
        ci, co, ce = os.popen3(command_grep)
        ci.close()
        lines = co.readlines()
        co.close()
        ce.close()

        # run the zgrep command
        ci, co, ce = os.popen3(command_zgrep)
        ci.close()
        lines.extend(co.readlines())
        co.close()
        ce.close()

        seentags = []
        ignoretags = []
        for line in lines:
            fname, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB, order = line.strip(
            ).split("\t")
            if fname.find(".symmetrized.gz") >= 0:
                # process the lines obtained with the zgrep command
                tagA, tagB = fname[0:fname.find(".symmetrized.gz"
                                                )][fname.find("/blast.") +
                                                   7:].split("_x_")
            else:
                # process the lines obtained with the (normal) grep command
                tagA, tagB = fname[0:fname.find(".symmetrized"
                                                )][fname.find("/blast.") +
                                                   7:].split("_x_")
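            # hedged illustration: a fname such as
            # ".../_crossblastp/blast.mgg_x_ncu.symmetrized" yields
            # tagA == "mgg" and tagB == "ncu" (tags here are made-up examples)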

            # ignore the line completely when a limitation on genomedirs is applied and valid
            if self.genometags_to_use:
                if not (tagA in self.genometags_to_use
                        and tagB in self.genometags_to_use):
                    continue
            if self.genometags_to_ignore:
                if tagA in self.genometags_to_ignore or tagB in self.genometags_to_ignore:
                    continue

            # ignore this line when one of the tags is in ignoretags
            if tagA in ignoretags: continue
            if tagB in ignoretags: continue

            # swap tagA & tagB when the tag's are in reversed order
            # this is due to the dbwarehouse crossblastp files
            # blast.B_x_A.symmetrized.gz isa symbolic link to
            # blast.A_x_B.symmetrized.gz if B > A (in string order)
            ordered_tags = [tagA, tagB]
            ordered_tags.sort()
            if [tagA, tagB] != ordered_tags:
                # swap tagA & tagB
                tagA, tagB = tagB, tagA

            if self.SEARCH_METHOD == 'HOMOLOGS':
                if self.ALLOW_PARALOGS:
                    pass
                else:
                    if tagA == tagB:
                        continue
                    if tagA in seentags and tagB in seentags:
                        continue
            elif self.SEARCH_METHOD == 'BDBH':
                if tagA == tagB and self.ALLOW_PARALOGS:
                    if tagA in [tup[0] for tup in self._data]:
                        continue  # there is already a fine hit gathered
                    else:
                        pass
                else:
                    if tagA == tagB and not self.ALLOW_PARALOGS:
                        continue
                    if tagA in seentags and tagB in seentags:
                        continue

            elif self.SEARCH_METHOD == 'SAFEORTHOLOGS':
                if tagA == tagB:
                    # check that there is no paralog in the identifier's own species
                    # that is too close to this identifier (a hypothetical paralogue)
                    ratioA, ratioB = float(ratioA), float(ratioB)
                    if max([ratioA, ratioB]) > self.SAFEORTHOLOGS_RATIO:
                        # there is a hypothetical paralogue in its own genome!
                        # empty the data and break out!
                        self._data = []
                        break
                    else:
                        continue

                elif tagA in seentags and tagB in seentags:
                    if idA == identifier:
                        ratio = float(ratioA)
                        thetag = tagB
                    else:
                        ratio = float(ratioB)
                        thetag = tagA
                    maxratio = self._getfromdata(self._data, thetag)[5]
                    if min([ratio / maxratio, maxratio / ratio
                            ]) > self.SAFEORTHOLOGS_RATIO:
                        # remove this tag from data -> ortholog assignment is not 100% sure!
                        self._removefromdata(self._data, thetag)
                        ignoretags.append(thetag)
                        continue
                    else:
                        continue
                else:
                    pass

            else:
                # mode similarity -> all hits are okay
                pass

            # append tags to seentags
            if tagA not in seentags: seentags.append(tagA)
            if tagB not in seentags: seentags.append(tagB)

            # if here, a similar protein is mined!
            # gather locusdir and similarity data
            bitscore = int(float(bitscore))
            overlapA = float(overlapA)
            overlapB = float(overlapB)
            ratioA = float(ratioA)
            ratioB = float(ratioB)
            #if idA == identifier:
            #    self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ))
            #    ###print line.strip()
            #    ###print "A", self._data[-1],"\n"
            #else:
            #    self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA ))
            #    ###print line.strip()
            #    ###print "B", self._data[-1],"\n"

            if idA == identifier:
                self._data.append(
                    (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB))
            elif idB == identifier:
                self._data.append(
                    (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA))
            elif idA.find(identifier) == 0:
                self._data.append(
                    (tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB))
            elif idB.find(identifier) == 0:
                self._data.append(
                    (tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA))
            else:
                print "WHAT ELSE!?::", tagA, tagB, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB

        # remove the TEMPORARY element in mode SAFEORTHOLOGS
        if self.SEARCH_METHOD == 'SAFEORTHOLOGS':
            self._removefromdata(self._data, genomedirtag)

        # order _data on bitscore
        tmpdata = []
        for item in self._data:
            tmpdata.append((item[2], item))
        tmpdata.sort()
        tmpdata.reverse()
        self._data = [item for (s, item) in tmpdata]
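        # (the three steps above are a decorate-sort-undecorate; a broadly
        #  equivalent, more idiomatic spelling would be
        #  self._data.sort(key=lambda item: item[2], reverse=True))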

        if verbose: print len(self._data), self.maximal_num_loci

        # remove _data elements when self.maximal_num_loci is exceeded
        if len(self._data) > self.maximal_num_loci - 1:
            if (self.verbose and verbose == None) or verbose:
                # print the removed loci to screen
                print "# removed loci (%s): --maximal_num_loci (%s) exceeded" % (
                    len(self._data) - self.maximal_num_loci + 1,
                    self.maximal_num_loci)
                for tup in self._data[self.maximal_num_loci - 1:]:
                    row = list(tup)
                    row.insert(0, genomedirtag)
                    row.insert(2, identifier)
                    print "\t".join([str(elem) for elem in row])
            # now actually remove the rows from _data
            # minus 1 is for the --identifier locus itself
            self._data = self._data[0:self.maximal_num_loci - 1]

        # get the loci belonging to the mined similar proteins
        for (tagB, idB, bitscore, overlapA, overlapB, ratioA,
             ratioB) in self._data:
            tagBgenomedir = os.path.join(self.dbwarehouse_path, tagB)
            locusdir = self.identifier2locusdir(idB, genomedir=tagBgenomedir)
            if not locusdir: print "HEROOO...."
            self._loci.append(locusdir)

        # add genomedirtag and identifier to _data rows
        for i in range(0, len(self._data)):
            row = list(self._data[i])
            row.insert(0, genomedirtag)
            row.insert(2, identifier)
            self._data[i] = tuple(row)

        if (self.verbose and verbose == None) or verbose:
            # print the results!
            print "# main (1th) and mined loci"
            for locus in self._loci:
                print locus
            print "# similarity data"
            #for ( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ) in self._data:
            #    print "\t".join([ str(elem) for elem in [genomedirtag, tagB, identifier, idB, bitscore, overlapA, overlapB, ratioA, ratioB ]])
            for row in self._data:
                print "\t".join([str(elem) for elem in row])
            print "# settings/options"
            print "seentags:  ", seentags
            print "ignoretags:", ignoretags
            print "use:       ", self.genometags_to_use
            print "ignore:    ", self.genometags_to_ignore
            print "# timing/performace"
            print stw.lap()

        return self._loci, self._data
Code example #5
    def construct_final_tiny_cbg(self,
        max_exon_nt_length=SHORT_TAILINGEXON_MAX_NT_LENGTH,
        max_intron_nt_length=SHORT_TAILINGEXON_MAX_INTRON_NT_LENGTH,
        take_max_best_acceptors=SHORT_TAILINGEXON_TAKE_MAX_BEST_ACCEPTORS,
        take_max_best_ecgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_ECGS,
        take_max_best_cbgs=SHORT_TAILINGEXON_TAKE_MAX_BEST_CBGS,
        maximal_current_stopcodongraph_average_weight=0.90,
        minimal_last_vs_new_identity_ratio=0.80,
        maximal_cexpander_cbg_tail_uniformity_aa_length=3,
        elegiable_donor_omsr_nt_offset=21,
        verbose=False):
        """
        Make a tiny final CBG by ``shooting tiny exons into the deep``
        """
        # get current last CBG
        last = self.get_final_cbg()

        # check if the final tail of this CBG is uniformly alignable
        cxpdrOutput = cexpanderanalyses_omsr2orfend(last)
        IS_UNIFORMLY_ALIGNED = True
        for trf in cxpdrOutput._transferblocks:
            if trf.binarystring[-maximal_cexpander_cbg_tail_uniformity_aa_length:].count("0"):
                IS_UNIFORMLY_ALIGNED = False
                break
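        # hedged illustration: with a tail length of 3, a binarystring ending
        # in "...111" counts zero 0's in its tail (uniformly aligned), whereas
        # one ending in "...101" counts a single 0 and flags it non-uniform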

        ############################################################
        if verbose:
            print "Cexpander uniformaly aligned:",
            print maximal_cexpander_cbg_tail_uniformity_aa_length,
            print "->", IS_UNIFORMLY_ALIGNED
            print "omsr:       ", last._cexpander.projected_on,
            print last._cexpander.binarystring
            trf = cxpdrOutput.get_transfer_of_projected_on(
                    last._cexpander.projected_on)
            if trf and trf != True:
                print "omsr2orfend:", last._cexpander.projected_on,
                print trf.binarystring
        ############################################################

        if IS_UNIFORMLY_ALIGNED:
            # break out of this function. The chance of overpredicting
            # a final tiny exon is bigger than that of finding a True one!
            return False

        # check if the stopcodongraph is not (very) good already
        if last._stopcodongraph.average_weight() >=\
        maximal_current_stopcodongraph_average_weight:
            # break out of this function. The chance of overpredicting
            # a final tiny exon is bigger than that of finding a True existing one
            return False

        # start the timer (performance benchmark in verbose mode)
        stw = StopWatch(name='stwFinalECG')
        stw.start()

        # get FinalExons on eligible Orfs based on the distance towards the OMSR
        # of the current last CBG and a minimal acceptor site score
        omsr  = last.overall_minimal_spanning_range()
        maxsr = last.maximal_spanning_range()
        ECG = ExonCollectionGraph()

        ################################################################
        if verbose:
            print "currentLAST", last
            print last._stopcodongraph
            print last._stopcodongraph.is_optimal()
            for org in last.organism_set():
                print org, last._stopcodongraph.is_optimal(organism=org)
            for organism in last.organism_set():
                node = last.node_by_organism(organism)
                theorf = last.get_orfs_of_graph(organism=organism)[0]
                print organism, "\t", node, "\t", max(omsr[node]), "\t",
                print max(maxsr[node]), theorf.endPY/3
        ################################################################

        for organism in last.organism_set():
            node = last.node_by_organism(organism)
            # calculate an offset for the acceptor position
            # variable elegiable_donor_omsr_nt_offset is needed to
            # enlarge the OMSR-defined offset. When the OMSR is by chance
            # a few nt or aa larger than the actual exon length, the true
            # acceptor position can be erroneously abandoned.
            offset = max(omsr[node]) * 3 - elegiable_donor_omsr_nt_offset 
            theorf = last.get_orfs_of_graph(organism=organism)[0]

            # check if this final orf itself can serve as a final extension
            remaining_orf_nt_length          = (theorf.protein_endPY - max(omsr[node])) * 3
            remaining_maxsr_nt_length        = (max(maxsr[node]) - max(omsr[node])) * 3
            remaining_maxsr_tostop_nt_length = (theorf.protein_endPY - max(maxsr[node])) * 3 


            FIND_NEW_FINAL_ORFS       = True
            STORE_CURRENT_ORF_AS_FIOO = False 
            if remaining_maxsr_nt_length >= max_exon_nt_length:
                # exceptionally large maxsr on the right side of the omsr;
                # store as FIOO, but do NOT search for an orf extension!
                ### FIND_NEW_FINAL_ORFS       = False # discarded 17/09/2009; when a poor maxsr is present, this is overruled!
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_maxsr_tostop_nt_length <= 18:
                # maxsr is less than 6 AA apart from the stop on the current orf
                #FIND_NEW_FINAL_ORFS       = False
                STORE_CURRENT_ORF_AS_FIOO = True
            elif remaining_orf_nt_length < max_exon_nt_length:
                # final piece of unaligned sequence is a perfect HMM seed
                STORE_CURRENT_ORF_AS_FIOO = True
            else:
                pass

            if STORE_CURRENT_ORF_AS_FIOO:
                cbs = CodingBlockStart( theorf.aapos2dnapos( max(omsr[node]) ) )
                # set pssm_score to (very) high; this rewards
                # using the current Orf as the last Orf
                cbs.pssm_score = 20.0
                fioo = FinalExonOnOrf(cbs,theorf.endPY,theorf)
                node = (organism,theorf.id,fioo.start,fioo.end)
                ECG.add_node_and_object(node,fioo)
                ################################################################
                if verbose:
                    print organism,theorf.id,"self==potential last exon", remaining_orf_nt_length
                    print organism, theorf.id, fioo, fioo.start,fioo.end, theorf.endPY
                ################################################################

            if not FIND_NEW_FINAL_ORFS:
                # quit here -> no orf extension of this CBG
                continue

            # get eligible (new) final orfs
            orflist = self.input[organism]['orfs'].get_elegiable_orfs(
                    max_orf_start=offset+max_intron_nt_length,
                    min_orf_end=offset )
            ################################################################
            if verbose:
                print organism, [ orf.id for orf in orflist ], "offset:", offset, offset/3
            ################################################################
            for orf in orflist:
                results = find_tailing_exon_on_orf(
                        theorf,orf,
                        current_donor_pos=offset,
                        max_tailingexon_nt_length=max_exon_nt_length,
                        max_tailingexon_intron_nt_length=max_intron_nt_length,
                        )
                for exon,intron in results:
                    node = (organism,orf.id,exon.start,exon.end)
                    if node not in ECG.get_nodes():
                        ECG.add_node_and_object(node,exon)
                        if verbose: print organism, node, exon

        if verbose: print stw.lap(), "Exon objects gathered", ECG.node_count()

        # now take only the best `take_max_best_acceptors`
        # because there can be quite a few of them!
        for organism in ECG.organism_set():
            objects = ordering.order_list_by_attribute( ECG.get_organism_objects(organism), order_by='pssm_score', reversed=True )
            for obj in objects[take_max_best_acceptors:]:
                node = (organism,obj.orf.id,obj.start,obj.end)
                ECG.del_node(node)
                if verbose: print "deleted:", node, obj.orf.id, obj.pssm_score

        ########################################################################
        if verbose:
            print stw.lap(), ">take_max_best_acceptors DELETED"
            for organism in ECG.organism_set():
                for obj in ordering.order_list_by_attribute(
                    ECG.get_organism_objects(organism),
                    order_by='pssm_score', reversed=True
                    ):
                    print "remaining", organism, obj.orf.id, obj.length, obj
        ######################################################################## 

        # only continue if all organisms are represented in the ECG
        if last.organism_set_size() > ECG.organism_set_size():
            if verbose: print "Too few organisms/genes present -> return False"
            return False

        # create edges in the ECG between nodes with compatible phases and
        # exon lengths, then make pacbps for these edges
        ECG.create_edges()
        ECG.make_pacbps_for_edges()
        if verbose:
            print stw.lap(), "edges + PACBPS created:", ECG.edge_count(), ECG.node_count(), len(ECG.pacbps)

        # search for complete graphs in this ECG
        last_exon_graphs = ECG.find_fully_connected_subgraphs()

        ########################################################################
        if verbose: 
            print stw.lap(), "duration of ECG.find_fully_connected_subgraphs()",
            print len(last_exon_graphs)
        ########################################################################

        # only continue if there is a perfectly aligned last exon graph
        if not (last_exon_graphs and last_exon_graphs[0].connectivitysaturation() == 1.0):
            ####################################################################
            if verbose: print "no perfect aligned last exon graph -> return False"
            ####################################################################
            return False

        # convert to CodingBlockGraphs
        new_last_cbgs = []
        for leg in last_exon_graphs[0:take_max_best_ecgs]:
            cbg = ExonCollectionGraph2CodingBlockGraph(leg,is_last=True,lastCBG=last)
            if cbg != False and cbg != None and cbg.organism_set_size() == last.organism_set_size():
                # create cache of CBG and do final check on quality
                cbg.create_cache()
                if (cbg.total_weight() < 0 or cbg.omsrlength() <= 10) and\
                cbg._cexpander.binarystring.find("1") == -1:
                    # discard hardly alignable CBGs
                    continue
                # if here, then append this cbg as a possible novel final CBG
                new_last_cbgs.append( cbg )
                ################################################################
                if verbose: print "LEGcbg", cbg
                ################################################################

        ########################################################################
        if verbose: print stw.lap(), "ECGs converted to CBGs", len(new_last_cbgs)
        ########################################################################

        if not new_last_cbgs:
            ####################################################################
            if verbose: print "no ecgs convertable to CBGs -> return False"
            ####################################################################
            return False

        # order by total weight, get the optimal CBG and its corresponding ECG
        new_last_cbgs = ordering.order_graphlist_by_total_weight(new_last_cbgs)
        theNewLastCbg = None
        cbgIF = None


        # check all interfaces between the novel final CBGs and the previous
        # CBG. The best interface is added to the GSG!
        cbgif_accepted_new_last_cbgs = []
        already_checked_node_sets = []

        for newcbg in new_last_cbgs[0:take_max_best_cbgs]:
            lastExonGraph = newcbg._ExonCollectionGraph
            del( newcbg._ExonCollectionGraph )

            # check if it is not the extension of the current
            # last CBG (identical nodes)
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 0:
                if verbose: print "newCBG is the extention of current last CBG!!"
                continue

            # check if this combination of nodes (orfs) has not been tried already
            if newcbg.get_ordered_nodes() in already_checked_node_sets:
                ###############################################################
                if verbose: 
                    print "newCBG node set done earlier:", 
                    print newcbg.get_ordered_nodes()
                ###############################################################
                continue
            else:
                # append this set of nodes (as a list) to checklist
                already_checked_node_sets.append( newcbg.get_ordered_nodes() )

            # check if this new final tinyexon graph has a compatible interface
            # with the current last one
            cbgIF = CodingBlockGraphInterface(last,newcbg)
            cbgIF.harvest_splice_sites()
            distinct_orgs = []
            for node in lastExonGraph.get_nodes():
                exon = lastExonGraph.get_node_object(node)
                if exon.acceptor.__class__.__name__ == 'SpliceAcceptor':
                    distinct_orgs.append( lastExonGraph.organism_by_node(node) )
            cbgIF.allow_intron_in_organisms(distinct_orgs)
            cbgIF.find_conserved_splice_sites()
            # do NOT optimize -> it consumes a lot of time and is helpful
            # only in extreme cases...
            #cbgIF.optimize()

            if not cbgIF.is_compatible():
                ################################################################
                if verbose:
                    print "newCBG not a is_compatible() cbgIF"
                    print newcbg
                ################################################################
                continue

            # append to cbgif_accepted_new_last_cbgs
            newcbg._CBGinterface5p = cbgIF
            cbgif_accepted_new_last_cbgs.append(
                    (
                        cbgIF.optimalitycheck().count(True),
                        newcbg.total_weight(),
                        newcbg
                    )
                )

        ########################################################################
        if verbose:
            print stw.lap(), "cbgIFs checked %s/%s" % (
                len(cbgif_accepted_new_last_cbgs),
                len(new_last_cbgs[0:take_max_best_cbgs])
                )
        ########################################################################
        # now start by adding the highest scoring newcbg first
        cbgif_accepted_new_last_cbgs.sort()
        cbgif_accepted_new_last_cbgs.reverse()

        ########################################################################
        if verbose:
            print "candidate novel final CBGs:", len(cbgif_accepted_new_last_cbgs)
            for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
        ########################################################################

        for (true_cnt,totalwt,newcbg) in cbgif_accepted_new_last_cbgs:
            # get the already created cbgIF from the newcbg graph
            cbgIF = newcbg._CBGinterface5p
    
            # now check 4 criteria:
            # (1) cbgIF.is_optimal() (2) >STG.totalweight
            # (3) >GTG.identity      (4) <=STG.distance
            criteria = []
            criteria.append( cbgIF.is_optimal() )
            criteria.append( newcbg._stopcodongraph.total_weight() > last._stopcodongraph.total_weight() )
            criteria.append( newcbg.genetree().identity() > last.genetree().identity() )
            criteria.append( newcbg._stopcodongraph.stopcodon2omsrdistance() <= last._stopcodongraph.stopcodon2omsrdistance() )

            ####################################################################
            if verbose:
                print "TRYING ADDITION of final newcbg", criteria
                print true_cnt,totalwt,newcbg._CBGinterface5p
                print newcbg
            ####################################################################

            # check if there is only a single different node/orf changed in the newcbg;
            # this is recognized by a symmetric_difference of size 2.
            # In this case, be very strict! This easily causes overprediction of (FP) tiny exons
            if len(last.node_set().symmetric_difference(newcbg.node_set())) == 2:
                # check if all 4 criteria are valid;
                # a single False results in not accepting this new last tiny cbg
                if False in criteria:
                    if verbose: print "# NOVEL lastTinyExon discarded; single orf extension, criteria", criteria
                    # continue -> no new tiny CBG
                    continue

            # now start checking the criteria.
            # criteria[0] == True means a fully is_optimal interface!
            # do not perform any additional check, just add!
            if criteria[0] == True:
                theNewLastCbg = newcbg
                break
            
            # total weight criterion -> new.tw() > last.tw()
            if criteria[1] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; to low total weight"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue

            # identity criterion -> allow a ratio instead of new.id() > last.id()
            # the strict criterion (>) is applied for single-new-orf-CBGs
            if criteria[2] == False:
                ratio = newcbg.genetree().identity() / last.genetree().identity()
                if ratio < minimal_last_vs_new_identity_ratio:
                    ######################################################################
                    if verbose:
                        print "# NOVEL lastTinyExon discarded; to low identity"
                        print "#", newcbg._stopcodongraph, newcbg.genetree().identity()
                    ######################################################################
                    # continue -> no new tiny CBG
                    continue
 
            if criteria[3] == False:
                ##########################################################################
                if verbose:
                    print "# NOVEL lastTinyExon discarded; higher stopcodon2omsrdistance"
                    print "#", newcbg._stopcodongraph
                ##########################################################################
                # continue -> no new tiny CBG
                continue
 
            # if this point is reached, a new tiny last CBG has been found!
            theNewLastCbg = newcbg
            # break out of the for loop; store into the genestructure
            break



        # all okay -> ready for inserting the new CBG
        if theNewLastCbg and verbose:
            ################################################################################
            print "NEW FINAL TINY EXON FOUND!!"
            print theNewLastCbg
            print cbgIF, cbgIF.is_optimal(), cbgIF.is_acceptable()
            print cbgIF._optimal_aligned_donor, cbgIF.donor_phase()
            print cbgIF._optimal_aligned_acceptor, cbgIF.acceptor_phase()
            ################################################################################

        # hard-insert into the genestructure;
        # using add_codingblock is likely to cause problems
        # because of the tininess of the CBG
        if theNewLastCbg:
            for pos in range(0,len(self)):
                if self.codingblockgraphs[pos].IS_IGNORED: continue
                if self.codingblockgraphs[pos].IS_LAST:
                    thelast = self.codingblockgraphs[pos]
                    thelast.IS_LAST = False
                    theNewLastCbg.IS_LAST = True
                    self.codingblockgraphs.insert(pos+1,theNewLastCbg)
                    # set the CBGInterface object in next and prev CBG
                    self.codingblockgraphs[pos]._CBGinterface3p = cbgIF
                    self.codingblockgraphs[pos+1]._CBGinterface5p = cbgIF
                    # break out; end of this function
                    break

            # done! return a True because newcbg is created & inserted
            return True
        else:
            # no newLastCbg found
            return False
コード例 #6
0
ファイル: abgpdbwarehouseminer.py プロジェクト: IanReid/ABFGP
    def mine(self,identifier,verbose=None):
        """ """
        # (re)set mined results to empty
        self._data = []
        self._loci = []

        # start timer
        stw = StopWatch("dbwarehouseMiner.mine('%s')" % identifier )
        if verbose: print stw.start()

        # find the current identifier in the warehouse
        identifier = identifier.replace("'","").replace('"','').strip()
        if not identifier: return False
        genomedir = self.identifier2genomedir(identifier)
        if not genomedir: return False

        # append the main/central locusdir to the loci
        locusdir = self.identifier2locusdir(identifier,genomedir=genomedir)
        if not locusdir: return False
        self._loci.append( locusdir )

        if verbose: print stw.lap(), "main locus identified"

        # now mine in the warehouse
        if self.SEARCH_METHOD != 'SIMILARITY':
            # set some column restraints as VERY strict (&&) instead of loose (||)
            column_restrain = "&&"
        else:
            column_restrain = "||"

        ####genomedirtag   = os.path.basename(os.path.split(genomedir)[0])
        genomedirtag   = os.path.basename(genomedir)
        blastarchpatAB = os.path.join(self.dbwarehouse_path,"_crossblastp","blast.%s_x_*.symmetrized" % (genomedirtag))
        blastarchpatBA = os.path.join(self.dbwarehouse_path,"_crossblastp","blast.*_x_%s.symmetrized" % (genomedirtag))
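
        # (assumed) field layout of a grepped *.symmetrized line, once the
        # first awk in basecommand below splits grep's "filename:match"
        # output on ':' into tab-separated columns:
        #   $1=filename $2=idA $3=idB $4=bitscore
        #   $5=overlapA $6=overlapB $7=ratioA $8=ratioB
        # (this matches the line.strip().split("\t") unpacking further below);
        # the second awk appends column 9 = (($5+$6)*$4)/2, an
        # overlap-weighted bitscore on which "sort -gr -k 9" ranks best-first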

        basecommand = """ awk -F':' '{ print $1"\\t"$2 }' | awk """ +\
                """ '{ if (($5>=%1.3f %s $6>=%1.3f) && ($7>=%1.3f %s $8>=%1.3f) && """ % (
                    self.MINIMAL_OVERLAP_RATIO,
                    column_restrain,
                    self.MINIMAL_OVERLAP_RATIO,
                    self.MINIMAL_BITSCORE_RATIO,
                    column_restrain,
                    self.MINIMAL_BITSCORE_RATIO,
                    ) +\
                """ (($5/$6)<=%1.2f %s ($6/$5)<=%1.2f)) { print $0"\t"(($5+$6)*$4)/2 } }' """ % (
                    self.MAXIMAL_LENGTH_RATIO,
                    column_restrain,
                    self.MAXIMAL_LENGTH_RATIO,
                    ) +\
                """ | sort -gr -k 9 """

        # commands with grep and zgrep for *.symmetrized and *.symmetrized.gz files
        command_grep = """grep "%s" %s %s | sort -u | %s""" % (
                identifier,
                blastarchpatAB,blastarchpatBA,
                basecommand)
        command_zgrep = """zgrep "%s" %s %s | sort -u | %s""" % (
                identifier,
                blastarchpatAB+".gz",blastarchpatBA+".gz",
                basecommand)

        # run the grep command
        ci,co,ce = os.popen3(command_grep)
        ci.close()
        lines = co.readlines()
        co.close()
        ce.close()

        # run the zgrep command
        ci,co,ce = os.popen3(command_zgrep)
        ci.close()
        lines.extend( co.readlines() )
        co.close()
        ce.close()

        seentags = []
        ignoretags = []
        for line in lines:
            fname, idA, idB, bitscore, overlapA, overlapB, ratioA, ratioB, order = line.strip().split("\t")
            if fname.find(".symmetrized.gz") >= 0:
                # process the lines obtained with the zgrep command
                tagA,tagB = fname[0:fname.find(".symmetrized.gz")][fname.find("/blast.")+7:].split("_x_")
            else:
                # process the lines obtained with the (normal) grep command
                tagA,tagB = fname[0:fname.find(".symmetrized")][fname.find("/blast.")+7:].split("_x_")

            # ignore the line completely when a limitation on genomedirs is applied and valid
            if self.genometags_to_use:
                if not (tagA in self.genometags_to_use and tagB in self.genometags_to_use):
                    continue
            if self.genometags_to_ignore:
                if tagA in self.genometags_to_ignore or tagB in self.genometags_to_ignore:
                    continue

            # ignore this line when one of the tags is in ignoretags
            if tagA in ignoretags: continue
            if tagB in ignoretags: continue

            # swap tagA & tagB when the tag's are in reversed order
            # this is due to the dbwarehouse crossblastp files
            # blast.B_x_A.symmetrized.gz isa symbolic link to
            # blast.A_x_B.symmetrized.gz if B > A (in string order)
            ordered_tags = [ tagA, tagB ]
            ordered_tags.sort()
            if [ tagA, tagB ] != ordered_tags:
                # swap tagA & tagB 
                tagA,tagB = tagB,tagA


            if self.SEARCH_METHOD == 'HOMOLOGS':
                if self.ALLOW_PARALOGS:
                    pass
                else:
                    if tagA == tagB:
                        continue
                    if tagA in seentags and tagB in seentags:
                        continue
            elif self.SEARCH_METHOD == 'BDBH':
                if tagA == tagB and self.ALLOW_PARALOGS:
                    if tagA in [ tup[0] for tup in self._data ]:
                        continue # there is already a fine hit gathered
                    else:
                        pass
                else:
                    if tagA == tagB and not self.ALLOW_PARALOGS:
                        continue
                    if tagA in seentags and tagB in seentags:
                        continue


            elif self.SEARCH_METHOD == 'SAFEORTHOLOGS':
                if tagA == tagB:
                    # check if there is not a paralog in the identifier's species itself
                    # that is too close to this identifier (a hypothetical paralogue)
                    ratioA, ratioB = float(ratioA), float(ratioB)
                    if max([ratioA,ratioB]) > self.SAFEORTHOLOGS_RATIO:
                        # there is in its own genome a hypothetical paralogue!
                        # empty data and break out!
                        self._data = []
                        break
                    else:
                        continue

                elif tagA in seentags and tagB in seentags:
                    if idA == identifier:
                        ratio = float(ratioA)
                        thetag = tagB
                    else:
                        ratio = float(ratioB)
                        thetag = tagA
                    maxratio = self._getfromdata(self._data,thetag)[5]
                    if min([ratio/maxratio, maxratio/ratio]) > self.SAFEORTHOLOGS_RATIO:
                        # remove this tag from data -> ortholog assignment is not 100% sure!
                        self._removefromdata(self._data,thetag)
                        ignoretags.append(thetag)
                        continue
                    else:
                        continue
                else:
                    pass

            else:
                # mode similarity -> all hits are okay
                pass


            # append tags to seentags
            if tagA not in seentags: seentags.append(tagA)
            if tagB not in seentags: seentags.append(tagB)

            # if here, a similar protein is mined!
            # gather locusdir and similarity data
            bitscore = int(float(bitscore))
            overlapA = float(overlapA)
            overlapB = float(overlapB)
            ratioA   = float(ratioA)
            ratioB   = float(ratioB)
            #if idA == identifier:
            #    self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ))
            #    ###print line.strip()
            #    ###print "A", self._data[-1],"\n"
            #else:
            #    self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA ))
            #    ###print line.strip()
            #    ###print "B", self._data[-1],"\n"

            if idA == identifier:
                self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ))
            elif idB == identifier:
                self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA ))
            elif idA.find(identifier) == 0:
                self._data.append(( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ))
            elif idB.find(identifier) == 0:
                self._data.append(( tagA, idA, bitscore, overlapB, overlapA, ratioB, ratioA ))
            else:
                print "WHAT ELSE!?::", tagA,tagB,idA,idB,bitscore, overlapA, overlapB, ratioA, ratioB



        # remove the temporarily stored element in mode SAFEORTHOLOGS
        if self.SEARCH_METHOD == 'SAFEORTHOLOGS':
            self._removefromdata(self._data,genomedirtag)

        # order _data on bitscore
        tmpdata = []
        for item in self._data:
           tmpdata.append( ( item[2], item ) )
        tmpdata.sort()
        tmpdata.reverse()
        self._data = [ item for (s,item) in tmpdata ]

        if (self.verbose and verbose==None) or verbose:
            print len(self._data), self.maximal_num_loci

        # remove _data elements when self.maximal_num_loci is exceeded
        if len(self._data) > self.maximal_num_loci -1:
            if (self.verbose and verbose==None) or verbose:
                # print the removed loci to screen
                print "# removed loci (%s): --maximal_num_loci (%s) exceeded" % (
                    len(self._data)-self.maximal_num_loci+1, self.maximal_num_loci )
                for tup in self._data[self.maximal_num_loci-1:]:
                    row = list( tup )
                    row.insert(0,genomedirtag)
                    row.insert(2,identifier)
                    print "\t".join([ str(elem) for elem in row ])
            # now actually remove the rows from _data
            # minus 1 is for the --identifier locus itself
            self._data = self._data[0:self.maximal_num_loci-1]


        # get the loci belonging to the mined similar proteins
        for ( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ) in self._data:
            tagBgenomedir = os.path.join(self.dbwarehouse_path,tagB)
            locusdir = self.identifier2locusdir(idB,genomedir=tagBgenomedir)
            if not locusdir: print "WARNING: no locusdir found for", idB
            self._loci.append( locusdir )

        # add genomedirtag and identifier to _data rows
        for i in range(0,len(self._data)):
            row = list( self._data[i] )
            row.insert(0,genomedirtag)
            row.insert(2,identifier)
            self._data[i] = tuple(row)

        if (self.verbose and verbose==None) or verbose:
            # print the results!
            print "# main (1th) and mined loci"
            for locus in self._loci:
                print locus
            print "# similarity data"
            #for ( tagB, idB, bitscore, overlapA, overlapB, ratioA, ratioB ) in self._data:
            #    print "\t".join([ str(elem) for elem in [genomedirtag, tagB, identifier, idB, bitscore, overlapA, overlapB, ratioA, ratioB ]])
            for row in self._data:
                print "\t".join([ str(elem) for elem in row ])
            print "# settings/options"
            print "seentags:  ", seentags
            print "ignoretags:", ignoretags
            print "use:       ", self.genometags_to_use
            print "ignore:    ", self.genometags_to_ignore
            print "# timing/performace"
            print stw.lap()

        return self._loci, self._data
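
The awk one-liner above packs three column restraints into a single filter. A plain-Python mirror can make those criteria easier to read; the sketch below is illustrative only (it is not called by the miner) and assumes the field layout documented next to basecommand above. Note that the MAXIMAL_LENGTH_RATIO clause is applied to the two overlap columns, exactly as in the awk program.

def passes_restraints(overlapA, overlapB, ratioA, ratioB,
                      min_overlap_ratio, min_bitscore_ratio,
                      max_length_ratio, strict=True):
    """ strict=True mimics the '&&' column restraint, strict=False the '||' """
    combine = min if strict else max  # min() of two booleans is AND, max() is OR
    overlap_ok  = combine(overlapA >= min_overlap_ratio,
                          overlapB >= min_overlap_ratio)
    bitscore_ok = combine(ratioA >= min_bitscore_ratio,
                          ratioB >= min_bitscore_ratio)
    length_ok   = combine(overlapA/overlapB <= max_length_ratio,
                          overlapB/overlapA <= max_length_ratio)
    return overlap_ok and bitscore_ok and length_ok

assert passes_restraints(0.9, 0.85, 0.4, 0.5, 0.8, 0.3, 1.5)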
Code Example #7
0
File: merging.py Project: IanReid/ABFGP
def merge_pacbporfs(old,new,side,verbose=False):
    """
    Amalgamate 2 (near) identical PacbPORFs with a slightly different
    alignment on the specified side

    @type  old: PacbPORF
    @param old: PacbPORF instance

    @type  new: PacbPORF
    @param new: PacbPORF instance to take the specified side from to
                amalgamate onto the other PacbPORF

    @attention: assumes 'new' PacbPORF to be different/longer on the specified side

    @rtype:  tuple
    @return: ( merged/amalgamated PacbPORF object,
               Boolean status whether or not the PacbPORFs are merged )
    """
    if side not in ['left','rigth']:
        message = "'side' not left or rigth but '%s'" % side
        raise InproperlyAppliedArgument, message

    stw = StopWatch()
    stw.start()

    # make temporarily unextended PacbPORFs
    old_was_extended, new_was_extended = False, False
    if old.is_extended():
        old.unextend_pacbporf()
        old_was_extended = True
    if new.is_extended():
        new.unextend_pacbporf()
        new_was_extended = True

    # call the merging helper function for the proper side
    if side == 'rigth':
        # do not get log messages from the helper function; it works fine...
        merged, status = _merge_pacbporfs_on_rigth_side(old,new,verbose=False)
        ###if verbose:
        ###    if status: print merged, "MERGED RIGTH"
        ###    else: print "FALSE MERGING rigth STATUS"
    else:
        # do not get log messages from the helper function; it works fine...
        merged, status = _merge_pacbporfs_on_left_side(old,new,verbose=False)
        ###if verbose:
        ###    if status: print merged, "MERGED LEFT"
        ###    else: print "FALSE MERGING left STATUS"

    # reset old and new PacbPORF objects to their extension status
    if old_was_extended: merged.extend_pacbporf_after_stops()
    if old_was_extended: old.extend_pacbporf_after_stops()
    if new_was_extended: new.extend_pacbporf_after_stops()
    # verbose logging message when merging was successful
    if verbose and status:
         print old, "STARTING SITUATION", side
         #old.print_protein(_linesize=150)
         print merged, "RE-EXTENDED", side
         #merged.print_protein(_linesize=150)

    # return the (freshly merged) pacbporf and the status whether it was successful
    return merged, status
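
The unextend-before / re-extend-after bookkeeping in merge_pacbporfs is a recurring pattern. A hedged refactoring sketch could centralize it in a context manager; this helper is hypothetical and relies only on the is_extended(), unextend_pacbporf() and extend_pacbporf_after_stops() methods used by the function above:

from contextlib import contextmanager

@contextmanager
def temporarily_unextended(pacbporf):
    # remember the extension state, normalize, and restore it afterwards
    was_extended = pacbporf.is_extended()
    if was_extended:
        pacbporf.unextend_pacbporf()
    try:
        yield pacbporf
    finally:
        if was_extended:
            pacbporf.extend_pacbporf_after_stops()

With such a helper, the four was_extended branches in merge_pacbporfs would collapse into two nested with-blocks; the freshly created merged object would still need its own extend_pacbporf_after_stops() call, as in the function above.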
Code Example #9
0
File: cbgjunction2blastp.py Project: IanReid/ABFGP
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    """
    ############################################################
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # depending on the omit flags, remove ORFs that do not belong to
    # prevCBG and nextCBG, or that DO belong to them
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects in order to ...
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces later on!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stored & kept,
    # whereas pacbpcol itself is split into CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
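
The function above keeps two parallel structures: pacbpcol, a bare topology that gets split into candidate CBGs, and dpcpacbpcol, which additionally carries the PacbPORF payloads keyed by (unique_key, nodeQ, nodeS). Stripped of the ABFGP classes, the bookkeeping looks like the sketch below, with plain dicts standing in for PacbpCollectionGraph:

# toy stand-ins: an edge->weight mapping for the splittable topology,
# plus a separate payload store keyed like dpcpacbpcol.pacbps
topology = {}   # (nodeQ, nodeS) -> bitscore
payloads = {}   # (uqkey, nodeQ, nodeS) -> alignment payload

def add_hit(nodeQ, nodeS, uqkey, bitscore, payload):
    # record the edge in the lightweight topology ...
    topology[(nodeQ, nodeS)] = bitscore
    # ... and keep the heavyweight alignment object apart, so the
    # topology can be split into subgraphs without losing the payloads
    payloads[(uqkey, nodeQ, nodeS)] = payload

def harvest(subgraph_nodes):
    # recover the payloads whose nodes both fall inside a subgraph
    # (the role harvest_pacbps_from_pacbpcollection plays above)
    return dict([ (key, val) for key, val in payloads.items()
                  if key[1] in subgraph_nodes and key[2] in subgraph_nodes ])

add_hit(("orgA", 1), ("orgB", 7), (120, 55, 1, 7), 120, "pacbporf-object")
print harvest([ ("orgA", 1), ("orgB", 7) ])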
Code Example #10
0
File: lib_cexpander.py Project: IanReid/ABFGP
def cexpander2multiplealignment(cxpdr, verbose=False):
    """
    This function and its application are still under development. In a future
    version, this cexpander-obtained data will replace the (deprecated) PAOC
    and PASC VISTA-like tracks, which were far too computationally expensive
    to obtain.
    """
    ########################################################################
    if verbose:
        stw = StopWatch(name="cxpdr2multiplealignment")
        stw.start()
    ########################################################################

    # for each of the _transferblocks (1 for each organism/gene), the
    # binarystring **should** contain an identical number of 1's;
    # in freak-accident cases (1 in hundreds of thousands of cases),
    # it is observed that this is not the case. Catch this exception here
    # before it hard-crashes with a raise somewhere later in this function
    if len(Set([trf.binarystring.count("1")
                for trf in cxpdr._transferblocks])) > 1:
        print "WARNING: unequal Cexpander.transferblocks.binarystring 1's count:",
        print Set(
            [trf.binarystring.count("1") for trf in cxpdr._transferblocks])
        return False

    # split the cexpander binarystrings on character changes 0->1 and 1->0
    substrings = {}
    orgs = [trf.header for trf in cxpdr._transferblocks]
    for ipos in range(0, len(orgs)):
        org = orgs[ipos]
        trf = cxpdr._transferblocks[ipos]
        substrings[org] = [
            x.group() for x in re.finditer("(1+|0+)", trf.binarystring)
        ]

    # maximum number of blocks in the cexpander output
    # WARNING TODO THIS IS STILL NOT 100% SAFE!!
    try:
        maxblocks = max(
            Set([len(substrings[org]) for org in substrings.keys()]))
    except:
        print "ERROR in cexpander2multiplealignment"
        print substrings.keys()
        print "inputseqs:", len(cxpdr.sequences)
        for k, v in substrings.iteritems():
            print k, len(v)
            print v
        # now raise the error...
        maxblocks = max(
            Set([len(substrings[org]) for org in substrings.keys()]))

    curblock = 0
    ########################################################################
    if verbose:
        print "maxblocks:", maxblocks,
        print [ len(substrings[org]) for org in substrings.keys() ]
        if len(Set([len(substrings[org]) for org in substrings.keys()])) > 1:
            for ipos in range(0, len(orgs)):
                org = orgs[ipos]
                print org,
                print [ Set(substrings[org][block])
                        for block in range(0, len(substrings[org])) ]
                trf = cxpdr._transferblocks[ipos]
                print trf.binarystring, len(trf.binarystring),
                print trf.binarystring.count("1"), trf.binarystring.count("0")
    ########################################################################
    while curblock < maxblocks:
        try:
            # create curblocktypeset
            curblocktypeset = Set("".join(
                [substrings[org][curblock] for org in substrings.keys()]))
        except IndexError:
            # substrings[org][curblock](s) IndexError
            # can happen on EOF blocks if some have zeros, others have nothing;
            # append an empty block; this will be dealt with in the
            # curblocktypeset Set("0")
            for org in substrings.keys():
                if len(substrings[org]) == curblock:
                    substrings[org].append("")
            # recreate curblocktypeset a second time
            curblocktypeset = Set("".join(
                [substrings[org][curblock] for org in substrings.keys()]))

        ########################################################################
        if verbose:
            print "curiter::", curblock, maxblocks,
            print [ len(substrings[org][curblock]) for org in substrings.keys() ]
        ########################################################################

        if curblocktypeset == Set("1"):
            # block of just ones; settle this block by truncating all
            # organisms to the minimal length of the 111-string
            curblocklengths = Set(
                [len(substrings[org][curblock]) for org in substrings.keys()])
            if len(curblocklengths) == 1:
                pass  # all normal...
            else:
                minlength = min(curblocklengths)
                for org in substrings.keys():
                    if len(substrings[org][curblock]) > minlength:
                        blocklen = len(substrings[org][curblock])
                        substrings[org][curblock] = substrings[org][curblock][
                            0:minlength]
                        substrings[org].insert(curblock + 1,
                                               "1" * (blocklen - minlength))
                        substrings[org].insert(curblock + 1, "")
                # increase maxblocks counter
                maxblocks = max(
                    Set([len(substrings[org]) for org in substrings.keys()]))
                ####################################################################
                if verbose:
                    print "TRBLOCKS CHANGED!, curblock, maxblocks:", curblock, maxblocks,
                    print [ len(substrings[org]) for org in substrings.keys() ]
                    for ipos in range(0, len(orgs)):
                        org = orgs[ipos]
                        print org,
                        print [ Set(substrings[org][block])
                                for block in range(0, len(substrings[org])) ]
                ####################################################################
        elif curblocktypeset == Set("0"):
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))
        elif curblocktypeset == Set(["0", "1"]):
            # situation where leading or intermediate zeros complicate the multiple alignment
            for org in substrings.keys():
                if Set(substrings[org][curblock]) == Set(['1']):
                    substrings[org].insert(curblock, "")
            # next, proceed as if curblocktypeset == Set("0") (which it is now!)
            # check lengths of the blocks
            lengths = [
                len(substrings[org][curblock]) for org in substrings.keys()
            ]
            for org in substrings.keys():
                if len(substrings[org][curblock]) != max(lengths):
                    substrings[org][curblock] += "." * (
                        max(lengths) - len(substrings[org][curblock]))
        else:
            print "MIXED!!", curblocktypeset, "curblock:", curblock, "maxblocks:", maxblocks
            print "ERROR WILL LIKELY OCCUR QUICKLY AFTER HERE..."
            import sys
            sys.exit()

        # increase the blocks counter
        curblock += 1
    ########################################################################
    if verbose:
        for org in substrings.keys():
            # print the sequence itself
            for block in range(0, maxblocks):
                offset = sum([
                    substrings[org][i].count("1") +
                    substrings[org][i].count("0") for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset +
                                               blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset +
                                               blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset +
                                               nongaps].lower() + "-" * gaps,
            print org
            for block in range(0, maxblocks):
                print substrings[org][block],
            print org
    ########################################################################
    if verbose:
        for block in range(0, maxblocks):
            if substrings[substrings.keys()[0]][block].count("1") > 0: continue
            for org in substrings.keys():
                offset = sum([
                    substrings[org][i].count("1") +
                    substrings[org][i].count("0") for i in range(0, block)
                ])
                blocklen = len(substrings[org][block])
                if Set(substrings[org][block]) == Set("1"):
                    print cxpdr.sequences[org][offset:offset +
                                               blocklen].upper(),
                elif Set(substrings[org][block]) == Set("0"):
                    print cxpdr.sequences[org][offset:offset +
                                               blocklen].lower(),
                else:
                    gaps = substrings[org][block].count(".")
                    nongaps = blocklen - gaps
                    print cxpdr.sequences[org][offset:offset +
                                               nongaps].lower() + "-" * gaps,
                print substrings[org][block],
                print org
    ########################################################################
    if verbose:
        for org in substrings.keys():
            print org, "\t",
            for block in range(0, maxblocks):
                print len(substrings[org][block]),
                if substrings[org][block].count("1") == 0:
                    print "(%s,%s)" % (substrings[org][block].count('0'),
                                       substrings[org][block].count('.')),
            print "\t\t", sum(
                [len(substrings[org][block]) for block in range(0, maxblocks)])
        print stw.lap()
    ########################################################################
    return substrings
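
The core string manipulation in cexpander2multiplealignment is easy to isolate: split each binarystring into maximal runs of '1' and '0' with re.finditer, then pad shorter all-zero blocks with '.' gap characters, as the Set("0") branch above does. A self-contained sketch with made-up binarystrings (equal total number of 1's, as the function requires):

import re

def binary_runs(binarystring):
    """ split e.g. '1110011' into maximal runs of identical characters """
    return [ m.group() for m in re.finditer("(1+|0+)", binarystring) ]

runs = { "orgA": binary_runs("1110011"),    # -> ['111', '00', '11']
         "orgB": binary_runs("11100011") }  # -> ['111', '000', '11']

# pad the variable-length all-zero block (index 1) with '.' gap characters
maxlen = max([ len(runs[org][1]) for org in runs.keys() ])
for org in runs.keys():
    runs[org][1] += "." * (maxlen - len(runs[org][1]))
print runs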
Code Example #12
0
    def _recrute_pacbporfs_from_parental_cbg(self,
                                             parentcbg,
                                             create_cache=True,
                                             ignore_nonexisting_edges=False,
                                             verbose=False):
        """
        Harvest PacbPORFs from a (parental) CodingBlockGraph

        @attention: alternative for harvest_pacbps_from_crossdata()
        @attention: required in _place_cbg_in_partialgsg() function
        @attention: use create_cache=False with care!

        @type  parentcbg: CodingBlockGraph
        @param parentcbg: CodingBlockGraph that has to deliver PacbPORFs

        @type  create_cache: Boolean
        @param create_cache: run the create_cache() function on the CBG (self)

        @type  ignore_nonexisting_edges: Boolean
        @param ignore_nonexisting_edges: when True, do not create edges in the
                                         CBG (self) that are absent (but present
                                         in the parentcbg)

        @type  verbose: Boolean
        @param verbose: print debugging information to STDOUT when True
        """
        replacements = {}
        substituted = 0

        ####################################################################
        if verbose:
            stw = StopWatch("recruteParentalPacbps")
            print stw.start()
            print "target:", self
            print "source:", parentcbg
        ####################################################################

        for (node1, node2) in self.pairwisecrosscombinations_node():
            # if this edge is not present in the parent, ignore it
            if not parentcbg.has_edge(node1, node2): continue
            # get PacbPORF of the parent
            origpacbporf = parentcbg.get_pacbps_by_nodes(node1=node1,
                                                         node2=node2)[0]
            curpacbporf = None
            replace_pacbporf = False
            if not self.has_edge(node1, node2):
                if ignore_nonexisting_edges:
                    # if ignore_nonexisting_edges -> do not recruit this pacbp
                    continue
                else:
                    # replace this Pacbporf if it exists and
                    # simultaneously create a novel edge
                    replace_pacbporf = True
            elif self.has_edge(node1, node2) and \
                 not self.get_pacbps_by_nodes(node1=node1, node2=node2):
                replace_pacbporf = True
            else:
                curpacbporf = self.get_pacbps_by_nodes(node1=node1,
                                                       node2=node2)[0]
                if pacb.comparison.IsIdenticalPacbPORF(origpacbporf,
                                                       curpacbporf):
                    # Pacbporfs are already identical; not relevant to copy
                    continue
                if origpacbporf.issuperset(curpacbporf):
                    # store to replacements dict
                    replacements[(node1, node2)] = curpacbporf
                    # remove from the CBG -> replacement in progress
                    self.remove_pacbp(curpacbporf, node1, node2)
                    replace_pacbporf = True

            # check if replace_pacbporf is set to True
            if replace_pacbporf:
                ################################################################
                if verbose:
                    print stw.lap(), "REPLACING PacbPORF Source->Target:"
                    print "T:", curpacbporf, "(current)"
                    print "S:", origpacbporf
                    origpacbporf.print_protein(_linesize=100)
                ################################################################
                newkey = origpacbporf.construct_unique_key(node1, node2)
                self.set_edge_weight(node1, node2, wt=origpacbporf.bitscore)
                self.pacbps[(newkey, node1, node2)] = origpacbporf
                substituted += 1

        # check if substitutions have been taken place
        if create_cache and substituted:
            #####################################################################
            if verbose:
                print stw.lap(), "CREATE_CACHE & substituted PacbPORFS:",
                print substituted, "edges:", len(self.weights) / 2,
                print "pacbps:", len(self.pacbps)
                ####for k,pacbporf in self.pacbps.iteritems():
                ####    print k,"\n",pacbporf
            #####################################################################
            self.clear_cache()
            # check if there is an OMSR upon recreation; in very
            # exceptional cases, OMSR can get lost in this step
            if self.has_overall_minimal_spanning_range():
                self.create_cache()
                self.update_edge_weights_by_minimal_spanning_range()
            else:
                #############################################################
                if verbose:
                    print stw.lap(), "OMSR got lost!",
                    print "replacements:", len(replacements)
                    for (n1, n2), curpacbporf in replacements.iteritems():
                        print "REP:", curpacbporf, n1, n2
                #############################################################
                # OMSR got lost! Restore replacements dict and as such
                # restore the original PacbPs one by one (in random order)
                # and quit as soon as an OMSR is restored
                for (node1, node2), curpacbporf in replacements.iteritems():
                    newkey = curpacbporf.construct_unique_key(node1, node2)
                    tobereplpacbporf = self.get_pacbps_by_nodes(node1=node1,
                                                                node2=node2)[0]
                    # remove from the CBG
                    self.remove_pacbp(tobereplpacbporf, node1, node2)
                    # and place back the original one
                    self.set_edge_weight(node1, node2, wt=curpacbporf.bitscore)
                    self.pacbps[(newkey, node1, node2)] = curpacbporf
                    substituted -= 1
                    if self.has_overall_minimal_spanning_range():
                        self.create_cache()
                        self.update_edge_weights_by_minimal_spanning_range()
                        #########################################################
                        if verbose:
                            print stw.lap(), "OMSR restored, substitutions:",
                            print substituted
                            print "T:", self
                        ##########################################################
                        # break out of the for loop of PacbP replacement
                        break

        # return number of replaced/added pacbporfs
        return substituted
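
The tail of _recrute_pacbporfs_from_parental_cbg implements a replace-then-verify-then-rollback scheme: every substitution is remembered in the replacements dict, and when the invariant (an overall minimal spanning range) is lost, originals are put back one at a time until it holds again. A minimal generic sketch of that shape, with a dict and a toy invariant standing in for the CBG and its OMSR:

def substitute_with_rollback(data, replacements, invariant):
    """ apply {key: new_value} replacements; undo one by one if invariant fails """
    originals = {}
    for key, new_value in replacements.items():
        originals[key] = data[key]
        data[key] = new_value
    if invariant(data):
        return True
    # invariant lost: restore originals (in arbitrary order) until it holds
    for key, old_value in originals.items():
        data[key] = old_value
        if invariant(data):
            return True
    return False

# toy example: the 'invariant' demands all values stay below 10
data = { "a": 3, "b": 4 }
ok = substitute_with_rollback(data, {"a": 12, "b": 5},
                              lambda d: max(d.values()) < 10)
print ok, data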