Beispiel #1
0
    def _parsefilehandle(self, filehandle):
        """
        Read an opened getorf output file and hand its records to the OrfSet

        @type  filehandle: filehandle
        @param filehandle: opened, readable handle on a getorf output file
        """
        # slurp all lines, parse them as fasta and convert the result to Orfs
        fasta_lines = filehandle.readlines()
        parsed_seqs = parseFasta(fasta_lines)
        self._dict_to_orfs(parsed_seqs)
Beispiel #2
0
    def _parsefilehandle(self, filehandle):
        """
        Convert the content of a getorf output file(handle) into Orf objects

        @type  filehandle: filehandle
        @param filehandle: handle of an already opened getorf output file
        """
        # parseFasta gets the complete list of lines of the file
        records = parseFasta(filehandle.readlines())
        self._dict_to_orfs(records)
Beispiel #3
0
    def getorfs(self, strand="+"):
        """
        Obtain the ORFs of self.sequence by running and parsing EMBOSS getorf

        @attention: implemented for ORFs from STOP to STOP

        @type  strand: string
        @param strand: strand of the DNA sequence (default '+')
        """
        # validate the strand first, then remember it on the object
        IsProperStrand(strand)
        self.strand = strand

        # run getorf, cut its output into lines and parse these as fasta
        getorf_output = getorf(sequence=self.sequence)
        fasta_lines = getorf_output.split("\n")
        self._dict_to_orfs(parseFasta(fasta_lines))
Beispiel #4
0
    def getorfs(self, strand="+"):
        """
        Run & Parse EMBOSS getorf from self.sequence

        @attention: implemented for ORFs from STOP to STOP

        @type  strand: string
        @param strand: strand of the DNA sequence (default '+'); it is
                       validated and stored on self.strand
        """
        # input validation; the return value of IsProperStrand is not used
        # (presumably it raises on an improper strand -- TODO confirm)
        IsProperStrand(strand)
        self.strand = strand

        # run getorf on the sequence and parse its output, split into lines,
        # as fasta
        seqs = parseFasta(getorf(sequence=self.sequence).split("\n"))
        # convert the parsed sequences into Orf objects
        self._dict_to_orfs(seqs)
Beispiel #5
0
def _mask_residues_before_first_start(db_fasta):
    """
    Mask, per sequence line, all residues in front of the first methionine

    @type  db_fasta: string
    @param db_fasta: (multi) fasta formatted string

    @rtype:  string
    @return: fasta string in which, on every non-header line, all characters
             before the first 'M' are replaced by 'X'; header lines, blank
             lines and lines without an 'M' are returned unchanged
    """
    masked = []
    for line in db_fasta.split("\n"):
        # blank lines (e.g. caused by a trailing newline) have no first
        # character to inspect; pass them through untouched
        if line and line[0] != ">":
            mpos = line.find("M")
            if mpos > 0:
                line = "X" * mpos + line[mpos:]
        masked.append(line)
    return "\n".join(masked)


def _create_hmm_db(organism,
                   inputdict,
                   cbg,
                   prev,
                   next,
                   orf_must_have_start=False,
                   max_intron_nt_length=200,
                   verbose=False):
    """
    Create fasta ORF database for an organism in a CBG and its vicinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict
    @param inputdict: <input data structure>; inputdict[organism]['orfs'] and
                      inputdict[organism]['genomeseq'] are read here

    @type  cbg: CodingBlockGraph or related object (or None)
    @param cbg: CodingBlockGraph that must be completed; only its truth value
                is evaluated in this function

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This speeds up and
                improves the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take
                                 into account when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)

    @rtype:  string or None
    @return: filename of the written hmm search fasta database, or None when
             no cbg was given, no eligible ORFs were found or the written
             database contains no sequences
    """
    # without a CBG there is nothing to create a database for
    if not cbg:
        return None

    # (1) try to limit searchspace by prev and next CBG
    prevNode, nextNode = None, None
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if (prev and organism in prev.organism_set()
            and prev.has_overall_minimal_spanning_range()):
        # NOTE(review): prevNode is not used any further in this function
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            # OMSR coordinate times 3 -> presumably AA to nt; TODO confirm
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # block has an OMSR, but not for this organism
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if (next and organism in next.organism_set()
            and next.has_overall_minimal_spanning_range()):
        # NOTE(review): nextNode is not used any further in this function
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            # explicit floor division: same result on Python 2 integers and
            # keeps the coordinate an integer on Python 3 as well
            aaseqlen = len(inputdict[organism]['genomeseq']) // 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # block has an OMSR, but not for this organism
            # NOTE(review): a KeyError of the inputdict lookup above is
            # swallowed here too; nextMax is then set without a mask coord
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get eligible sets of orfs from prev and next
    orfset = inputdict[organism]['orfs']
    if not orf_must_have_start:
        elegiable_orfs = orfset.get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax)
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = orfset.get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs:
        return None

    # (3) write masked orfs to fasta database multi line string
    db_fasta = orfset.tomaskedfasta(
        coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism)
    if orf_must_have_start:
        # mask out all AAs before the first start; a string without any
        # unmasked ORF passes through unchanged and is recognized in step (6)
        db_fasta = _mask_residues_before_first_start(db_fasta)

    ############################################################################
    if verbose:
        # report the ids themselves only for short lists, else just the count
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        if prev:
            prev_has_omsr = prev.has_overall_minimal_spanning_range()
        else:
            prev_has_omsr = None
        if next:
            next_has_omsr = next.has_overall_minimal_spanning_range()
        else:
            next_has_omsr = None
        # single-argument print(): valid as statement and as function call
        print("hmm-elegibable orfs: %s %s / %s prevMin: %s %s nextMax: %s %s" % (
            organism, orfidlist, len(orfset.orfs),
            prevMin, prev_has_omsr, nextMax, next_has_omsr))
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database; try/finally makes sure the
    # handle is closed even when the write fails
    fh = open(fname_hmm_db_mfa, 'w')
    try:
        fh.write(db_fasta)
    finally:
        fh.close()

    # (6) make sure that there were orfs written to file;
    # in case very few orfs are selected and all are masked -> no files!
    fh = open(fname_hmm_db_mfa)
    try:
        seqs_in_db = parseFasta(fh.readlines())
    finally:
        fh.close()
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
Beispiel #6
0
        return ntcoord


# end of function _obtain_unigene_gff_nt_end_coord

# Build the shell pipeline: get fasta by gff, remove headers, concatenate into
# a single DNA seq and translate it with transeq in frame `transeq_frame`.
command = """%s %s %s | grep -v "^>" | tr -d " \n" | %s -filter -frame %s """ % (
    PYTHON_PATH, EXECUTABLE_GFF2FASTA, fastafile, EXECUTABLE_TRANSEQ,
    transeq_frame)

# The unigene gff exon lines are fed to the pipeline on STDIN; its STDOUT is
# parsed as fasta into `frametrans`.
if python_version <= 2.5:
    # Python2.4 os.popen syntax
    ci, co = os.popen2(command)
    ci.write("\n".join(unigenegffexons))
    ci.close()
    frametrans = parseFasta(co.readlines())
    co.close()
else:
    # Python2.6 subprocess.Popen syntax
    from subprocess import Popen, PIPE
    p = Popen(command, shell=True, stdin=PIPE, stdout=PIPE, close_fds=True)
    (child_stdin, child_stdout) = (p.stdin, p.stdout)
    child_stdin.write("\n".join(unigenegffexons))
    # closing STDIN signals end-of-input to the pipeline
    child_stdin.close()
    frametrans = parseFasta(child_stdout.readlines())
    # release the pipe and reap the child process, just like the popen2
    # branch closes its output handle
    child_stdout.close()
    p.wait()

# command to get DNA sequence of unigene exon(s)
command = """%s %s %s | grep -v "^>" """ % (
    PYTHON_PATH,
    EXECUTABLE_GFF2FASTA,
    fastafile,
Beispiel #7
0
def blastanalysescbgjunction(
        gsg,
        prevCBG,
        nextCBG,
        omit_cbg_orfs=False,
        omit_non_cbg_orfs=False,
        extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
        omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
        verbose=False):
    """
    Blastp the masked Orfs around the junction of two CBGs against each
    other and assemble the resulting hits into new CodingBlockGraphs

    @type  gsg: object (presumably a GenestructureGraph -- TODO confirm)
    @param gsg: provides input, organism_set(), organism_set_size(),
                node_count() and EXACT_SG_NODE_COUNT as used in this function

    @type  prevCBG: CodingBlockGraph or related object
    @param prevCBG: first CBG of the junction; its Orfs, barcode and
                    organism combinations are used

    @type  nextCBG: CodingBlockGraph or related object
    @param nextCBG: second CBG of the junction; its Orfs and barcode are used

    @type  omit_cbg_orfs: Boolean
    @param omit_cbg_orfs: when True, the Orfs of prevCBG and nextCBG are not
                          gathered as starting set of Orfs

    @type  omit_non_cbg_orfs: Boolean
    @param omit_non_cbg_orfs: when True, fasta entries of Orfs that are not
                              in the gathered set of Orfs are removed

    @type  extra_blastp_params: * (passed on unchanged)
    @param extra_blastp_params: handed over to blastall_seq2db

    @type  omsr_2_mask_aa_length_correction: * (passed on unchanged)
    @param omsr_2_mask_aa_length_correction: handed over to
                          create_hmmdb_for_neighbouring_cbgs

    @type  verbose: Boolean
    @param verbose: print timing and debugging information to STDOUT

    @rtype:  list
    @return: cbgList.codingblockgraphs of the newly created CBGs, or an empty
             list when the fasta database or the organism coverage is
             insufficient
    """
    ############################################################
    # NOTE(review): `stw` only exists when verbose is True; all later
    # uses of it are guarded by the same flag
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    # orfs: dict with (organism, orf id) nodes as keys and Orf objects as values
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        # (only the first Orf of each organism's list is taken)
        for org, orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf
        for org, orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org, orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    # (the multi fasta string is split into lines and parsed by parseFasta)
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,
            prevCBG,
            nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
        ).split("\n"))

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):", len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    # (keys() is taken up front, so deleting inside the loop is safe here)
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        # headers are expected in the form <organism>_orf_<integer orf id>
        org, orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org, orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
                del (fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        # NOTE(review): with omit_cbg_orfs=True, `orfs` starts out empty (see
        # above), so this test can only match nodes added earlier in this
        # loop -- confirm that this is the intended behaviour
        if omit_cbg_orfs and node in orfs:
            del (fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):", len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):", len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return []

    # check if all organisms are still covered
    # (any difference between both organism sets aborts the analysis)
    orgSet = Set([k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return []

    # create !single! fasta database
    # (file name is built from the barcodes of both CBGs)
    fastadbname = prevCBG.barcode() + "_" + nextCBG.barcode() + ".mfa"
    writeMultiFasta(fastadbmfa, fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    # NOTE(review): multifastas and blastdbs are never filled in this function
    multifastas = {}
    blastdbs = {}
    pacbpcol = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph()  # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    # blast every (masked) query Orf of orgQ against the single database and
    # keep the hits on Orfs of orgS
    for orgQ, orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ, orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ + "_orf_" + str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ + "_orf_" + str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,
                                       protseq,
                                       fastadbname,
                                       extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS, _orfSid = alignment.title.replace(">",
                                                         "").split("_orf_")
                # skip hits on organisms other than the current sbjct organism
                if _orgS != orgS: continue
                nodeS = (_orgS, int(_orfSid))
                orfS = orfs[nodeS]

                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                # (the hsp object is modified in place)
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                    pacb.PacbP(blastp_hsp=hsp), orfQ, orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ, orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = (orgQ, orfQ.id)
                nodeS = (orgS, orfS.id)
                uqkey = pacbporf.construct_unique_key(nodeQ, nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ, nodeS, wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey, nodeQ, nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    # NOTE(review): multifastas and blastdbs are still empty here, so only
    # "formatdb.log" is passed with a name; fastadbname itself is not passed
    # to _file_cleanup in this function -- confirm whether that is intended
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([fname + ".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return []

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes(pacbpcol.get_nodes())
    for (uqkey, nodeQ, nodeS) in dpcpacbpcol.pacbps.keys():
        # the unique key is unpacked as a 4-tuple; only its bitscore is used
        (bitscore, length, orfQid, orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ, nodeS, wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print[p.bitscore for p in dpcpacbpcol.pacbps.values()]
        print "PCG nodes:", dpcpacbpcol.get_ordered_nodes()
    ################################################################

    #### do some transformations on the pacbpcol
    ####pacbpcol.remove_low_connectivity_nodes(min_connectivity=gsg.EXACT_SG_NODE_COUNT-1)
    ####splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
    ####        edges=gsg.node_count()-1 , max_missing_edges=0 )
    ##### convert to list of CBGs and do some transformations
    ####cbgList = ListOfCodingBlockGraphs(splittedCBGs,input={},crossdata={})
    ####cbgList.remove_all_but_complete_cbgs()
    ####cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    ####cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    ####cbgList.remove_cbgs_without_omsr()
    ####cbgList.update_edge_weights_by_minimal_spanning_range()
    ####cbgList.order_list_by_attribute(order_by='total_weight',reversed=True)

    # active variant of the block above: nodes need a connectivity of at
    # least EXACT_SG_NODE_COUNT-3 (minimum 1), and up to EXACT_SG_NODE_COUNT-3
    # missing edges are allowed when searching for subgraphs
    min_connectivity = max([1, gsg.EXACT_SG_NODE_COUNT - 1 - 2])
    pacbpcol.remove_low_connectivity_nodes(min_connectivity=min_connectivity)
    max_missing_edges = gsg.EXACT_SG_NODE_COUNT - 3
    splittedCBGs = pacbpcol.find_fully_connected_subgraphs(
        edges=gsg.node_count() - 1, max_missing_edges=max_missing_edges)
    # convert to list of CBGs and do some transformations
    cbgList = ListOfCodingBlockGraphs(splittedCBGs, input={}, crossdata={})
    cbgList.remove_all_but_cbgs()
    cbgList.harvest_pacbps_from_pacbpcollection(dpcpacbpcol)
    cbgList.make_pacbps_for_missing_edges()
    cbgList.remove_all_but_complete_cbgs()
    cbgList.remove_cbgs_with_lt_nodes(gsg.EXACT_SG_NODE_COUNT)
    cbgList.remove_cbgs_without_omsr()
    cbgList.update_edge_weights_by_minimal_spanning_range()
    cbgList.order_list_by_attribute(order_by='total_weight', reversed=True)

    # and create_cache() for these CBGs
    for cbg in cbgList:
        cbg.create_cache()

    ####################################################################
    if verbose:
        print stw.lap(), "CBGs created", len(cbgList)
        for newcbg in cbgList:
            print "new:", newcbg
    ####################################################################

    # return list with CBGs
    return cbgList.codingblockgraphs
Beispiel #8
0
def _mask_residues_before_first_start(db_fasta):
    """
    Mask, per sequence line, all residues in front of the first methionine

    @type  db_fasta: string
    @param db_fasta: (multi) fasta formatted string

    @rtype:  string
    @return: fasta string in which, on every non-header line, all characters
             before the first 'M' are replaced by 'X'; header lines, blank
             lines and lines without an 'M' are returned unchanged
    """
    masked = []
    for line in db_fasta.split("\n"):
        # blank lines (e.g. caused by a trailing newline) have no first
        # character to inspect; pass them through untouched
        if line and line[0] != ">":
            mpos = line.find("M")
            if mpos > 0:
                line = "X" * mpos + line[mpos:]
        masked.append(line)
    return "\n".join(masked)


def _create_hmm_db(organism,inputdict,cbg,prev,next,
    orf_must_have_start=False,max_intron_nt_length=200,
    verbose=False):
    """
    Create fasta ORF database for an organism in a CBG and its vicinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict
    @param inputdict: <input data structure>; inputdict[organism]['orfs'] and
                      inputdict[organism]['genomeseq'] are read here

    @type  cbg: CodingBlockGraph or related object (or None)
    @param cbg: CodingBlockGraph that must be completed; only its truth value
                is evaluated in this function

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This speeds up and
                improves the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take
                                 into account when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)

    @rtype:  string or None
    @return: filename of the written hmm search fasta database, or None when
             no cbg was given, no eligible ORFs were found or the written
             database contains no sequences
    """
    # without a CBG there is nothing to create a database for
    if not cbg:
        return None

    # (1) try to limit searchspace by prev and next CBG
    prevNode, nextNode = None, None
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if (prev and organism in prev.organism_set()
            and prev.has_overall_minimal_spanning_range()):
        # NOTE(review): prevNode is not used any further in this function
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            # OMSR coordinate times 3 -> presumably AA to nt; TODO confirm
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # block has an OMSR, but not for this organism
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if (next and organism in next.organism_set()
            and next.has_overall_minimal_spanning_range()):
        # NOTE(review): nextNode is not used any further in this function
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            # explicit floor division: same result on Python 2 integers and
            # keeps the coordinate an integer on Python 3 as well
            aaseqlen = len(inputdict[organism]['genomeseq']) // 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # block has an OMSR, but not for this organism
            # NOTE(review): a KeyError of the inputdict lookup above is
            # swallowed here too; nextMax is then set without a mask coord
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get eligible sets of orfs from prev and next
    orfset = inputdict[organism]['orfs']
    if not orf_must_have_start:
        elegiable_orfs = orfset.get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax)
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = orfset.get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs:
        return None

    # (3) write masked orfs to fasta database multi line string
    db_fasta = orfset.tomaskedfasta(
        coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism)
    if orf_must_have_start:
        # mask out all AAs before the first start; a string without any
        # unmasked ORF passes through unchanged and is recognized in step (6)
        db_fasta = _mask_residues_before_first_start(db_fasta)

    ############################################################################
    if verbose:
        # report the ids themselves only for short lists, else just the count
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        if prev:
            prev_has_omsr = prev.has_overall_minimal_spanning_range()
        else:
            prev_has_omsr = None
        if next:
            next_has_omsr = next.has_overall_minimal_spanning_range()
        else:
            next_has_omsr = None
        # single-argument print(): valid as statement and as function call
        print("hmm-elegibable orfs: %s %s / %s prevMin: %s %s nextMax: %s %s" % (
            organism, orfidlist, len(orfset.orfs),
            prevMin, prev_has_omsr, nextMax, next_has_omsr))
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database; try/finally makes sure the
    # handle is closed even when the write fails
    fh = open(fname_hmm_db_mfa, 'w')
    try:
        fh.write(db_fasta)
    finally:
        fh.close()

    # (6) make sure that there were orfs written to file;
    # in case very few orfs are selected and all are masked -> no files!
    fh = open(fname_hmm_db_mfa)
    try:
        seqs_in_db = parseFasta(fh.readlines())
    finally:
        fh.close()
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
Beispiel #9
0
def blastanalysescbgjunction(gsg,prevCBG,nextCBG,
    omit_cbg_orfs = False,
    omit_non_cbg_orfs = False,
    extra_blastp_params=CBG_JUNCTION_BLAST2PACBPCOL_EXTRA_BLASTP_PARAMS,
    omsr_2_mask_aa_length_correction=CBG_JUNCTION_BLAST2PACBPCOL_OMSR_2_AA_MASK,
    verbose=False):
    """
    Blastp the masked Orfs around the junction of two CBGs against each
    other and collect the resulting hits in PacbpCollectionGraphs

    @type  gsg: object (presumably a GenestructureGraph -- TODO confirm)
    @param gsg: provides input, organism_set() and organism_set_size() as
                used in this function

    @type  prevCBG: CodingBlockGraph or related object
    @param prevCBG: first CBG of the junction; its Orfs, barcode and
                    organism combinations are used

    @type  nextCBG: CodingBlockGraph or related object
    @param nextCBG: second CBG of the junction; its Orfs and barcode are used

    @type  omit_cbg_orfs: Boolean
    @param omit_cbg_orfs: when True, the Orfs of prevCBG and nextCBG are not
                          gathered as starting set of Orfs

    @type  omit_non_cbg_orfs: Boolean
    @param omit_non_cbg_orfs: when True, fasta entries of Orfs that are not
                              in the gathered set of Orfs are removed

    @type  extra_blastp_params: * (passed on unchanged)
    @param extra_blastp_params: handed over to blastall_seq2db

    @type  omsr_2_mask_aa_length_correction: * (passed on unchanged)
    @param omsr_2_mask_aa_length_correction: handed over to
                          create_hmmdb_for_neighbouring_cbgs

    @type  verbose: Boolean
    @param verbose: print timing and debugging information to STDOUT

    @rtype:  list
    @return: an empty list when the fasta database or the organism coverage
             is insufficient; NOTE(review): the visible code has no return
             statement after the final verbose block
    """
    ############################################################
    # NOTE(review): `stw` only exists when verbose is True; all later
    # uses of it are guarded by the same flag
    if verbose:
        stw = StopWatch('blastanalysescbgjunction')
        stw.start()
    ############################################################
    # orfs: dict with (organism, orf id) nodes as keys and Orf objects as values
    orfs = {}
    if not omit_cbg_orfs:
        # gather Orfs from prevCBG and nextCBG
        # (only the first Orf of each organism's list is taken)
        for org,orflist, in prevCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf
        for org,orflist, in nextCBG.get_orfs_of_graph().iteritems():
            orf = orflist[0]
            orfs[(org,orf.id)] = orf

    ############################################################
    if verbose:
        print stw.lap(), "orfs (1):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # create masked fasta database in a dict
    # (the multi fasta string is split into lines and parsed by parseFasta)
    fastadbmfa = parseFasta(
        create_hmmdb_for_neighbouring_cbgs(
            gsg.input,prevCBG,nextCBG,
            omsr_2_mask_aa_length_correction=omsr_2_mask_aa_length_correction,
            ).split("\n")
        )

    ############################################################
    if verbose: print stw.lap(), "fasta db (1):",len(fastadbmfa)
    ############################################################

    # remove ORFs that do not belong to prevCBG and nextCBG,
    # or that DO belong to prevCBG and nextCBG, or neither
    # (keys() is taken up front, so deleting inside the loop is safe here)
    fastaheaders = fastadbmfa.keys()
    for header in fastaheaders:
        # headers are expected in the form <organism>_orf_<integer orf id>
        org,orfid = header.split("_orf_")
        orfid = int(orfid)
        node = (org,orfid)

        # check for the omit_non_cbg_orfs criterion
        add_orf = False
        if omit_non_cbg_orfs:
            if node not in orfs:
               del(fastadbmfa[header])
        else:
            add_orf = True

        # check for the omit_cbg_orfs criterion
        # NOTE(review): with omit_cbg_orfs=True, `orfs` starts out empty (see
        # above), so this test can only match nodes added earlier in this
        # loop -- confirm that this is the intended behaviour
        if omit_cbg_orfs and node in orfs:
            del(fastadbmfa[header])

        if add_orf:
            # get this Orf and add to orfs
            orfs[node] = gsg.input[org]['orfs'].get_orf_by_id(orfid)

    ############################################################
    if verbose:
        print stw.lap(), "fasta db (2):",len(fastadbmfa)
        print _format_fastadbmfa_nodes_to_string(fastadbmfa.keys())
    ############################################################

    ############################################################
    if verbose:
        print stw.lap(), "orfs (2):",len(orfs)
        print _format_orf_nodes_to_string(orfs.keys())
    ############################################################

    # no query/sbjct range left at all
    if not fastadbmfa: return [] 

    # check if all organisms are still covered
    # (any difference between both organism sets aborts the analysis)
    orgSet = Set([ k.split("_orf_")[0] for k in fastadbmfa.keys()])
    if orgSet.symmetric_difference(gsg.organism_set()):
        return [] 

    # create !single! fasta database
    # (file name is built from the barcodes of both CBGs)
    fastadbname = prevCBG.barcode()+"_"+nextCBG.barcode()+".mfa"
    writeMultiFasta(fastadbmfa,fastadbname)
    formatdb(fname=fastadbname)

    # remap the identifiers of the orf objects i.o.t....
    # NOTE(review): multifastas and blastdbs are never filled in this function
    multifastas = {}
    blastdbs = {}
    pacbpcol    = PacbpCollectionGraph()
    dpcpacbpcol = PacbpCollectionGraph() # ``deepcopied`` variant for pacbps

    ############################################################
    if verbose: print stw.lap(), "blastp starting"
    ############################################################

    # blast every (masked) query Orf of orgQ against the single database and
    # keep the hits on Orfs of orgS
    for orgQ,orgS in prevCBG.pairwisecrosscombinations_organism():

        for nodeQ,orfQ in orfs.iteritems():
            # only blast the (masked) Orfs of orgQ
            if prevCBG.organism_by_node(nodeQ) != orgQ: continue
            # get the masked protein sequence of this orfObj
            header = orgQ+"_orf_"+str(orfQ.id)
            # check if key exists in fastadbmfa. In a case where
            # an Orf is masked out completely, it is absent here!
            if not fastadbmfa.has_key(header): continue
            protseq = fastadbmfa[orgQ+"_orf_"+str(orfQ.id)]
            # run blast_seqs2db
            blastrec = blastall_seq2db(orfQ.id,protseq,fastadbname,
                    extra_blastp_params=extra_blastp_params)
            # omit empty blast records
            if len(blastrec.alignments) == 0: continue

            for alignment in blastrec.alignments:
                # get sbjct Org and Orf identifiers
                _orgS,_orfSid = alignment.title.replace(">","").split("_orf_")
                # skip hits on organisms other than the current sbjct organism
                if _orgS != orgS: continue
                nodeS = (_orgS,int(_orfSid))
                orfS  = orfs[nodeS]
               
                # take only the *best* HSP (highest scoring first one)
                hsp = alignment.hsps[0]

                # correct to absolute positions
                # (the hsp object is modified in place)
                hsp.query_start = hsp.query_start + orfQ.protein_startPY
                hsp.sbjct_start = hsp.sbjct_start + orfS.protein_startPY

                # initialize the PacbP
                pacbporf = pacb.conversion.pacbp2pacbporf(
                        pacb.PacbP(blastp_hsp=hsp),orfQ,orfS)

                ################################################################
                if verbose:
                    print pacbporf, orgQ,orgS, orfQ
                    print pacbporf.query
                    print pacbporf.match
                    print pacbporf.sbjct
                ################################################################

                # create nodes; ( Organism Identifier, Orf Identifier )
                nodeQ = ( orgQ, orfQ.id )
                nodeS = ( orgS, orfS.id )
                uqkey = pacbporf.construct_unique_key(nodeQ,nodeS)
                if not nodeQ in pacbpcol.get_nodes(): pacbpcol.add_node(nodeQ)
                if not nodeS in pacbpcol.get_nodes(): pacbpcol.add_node(nodeS)
                pacbpcol.add_edge(nodeQ,nodeS,wt=pacbporf.bitscore)
                # store to dpcpacbpcol -> pacbpcol is broken in pieces lateron!
                dpcpacbpcol.pacbps[(uqkey,nodeQ,nodeS)] = pacbporf

    ############################################################
    if verbose: print stw.lap(), "blastp done"
    ############################################################

    # file cleanup
    # NOTE(review): multifastas and blastdbs are still empty here, so only
    # "formatdb.log" is passed with a name; fastadbname itself is not passed
    # to _file_cleanup in this function -- confirm whether that is intended
    _file_cleanup(multifastas.values())
    _file_cleanup(["formatdb.log"])
    _file_cleanup([ fname+".*" for fname in blastdbs.values()])

    # check if all Organism/Gene identifiers are covered in PacbPs
    if not pacbpcol.organism_set_size() == gsg.organism_set_size():
        return [] 

    # ``deepcopy`` PacbPcollection pacbpcol to dpcpacbpcol
    # In dpcpacbpcol the actual PacbPORFs are stores & kept,
    # whereas pacbpcol itself is splitted in CBGs (which
    # function does not yet (!?) take the actual pacbps into account)
    dpcpacbpcol.add_nodes( pacbpcol.get_nodes() )
    for (uqkey,nodeQ,nodeS) in dpcpacbpcol.pacbps.keys():
        # the unique key is unpacked as a 4-tuple; only its bitscore is used
        (bitscore,length,orfQid,orfSid) = uqkey
        dpcpacbpcol.add_edge(nodeQ,nodeS,wt=bitscore)

    ################################################################
    if verbose:
        print pacbpcol
        print "PCG bitscores:",
        print [ p.bitscore for p in dpcpacbpcol.pacbps.values() ]
Beispiel #10
0

# Build the shell pipeline: get fasta by gff, remove headers, concatenate into
# a single DNA seq and translate it with transeq in frame `transeq_frame`.
command = """%s %s %s | grep -v "^>" | tr -d " \n" | %s -filter -frame %s """ % (
        PYTHON_PATH,
        EXECUTABLE_GFF2FASTA,
        fastafile,
        EXECUTABLE_TRANSEQ,
        transeq_frame )

# The unigene gff exon lines are fed to the pipeline on STDIN; its STDOUT is
# parsed as fasta into `frametrans`.
if python_version <= 2.5:
    # Python2.4 os.popen syntax
    ci,co = os.popen2(command)
    ci.write("\n".join(unigenegffexons))
    ci.close()
    frametrans = parseFasta( co.readlines() )
    co.close()
else:
    # Python2.6 subprocess.Popen syntax
    from subprocess import Popen, PIPE
    p = Popen(command,shell=True,stdin=PIPE, stdout=PIPE, close_fds=True)
    (child_stdin, child_stdout) = (p.stdin, p.stdout)
    child_stdin.write("\n".join(unigenegffexons))
    # closing STDIN signals end-of-input to the pipeline
    child_stdin.close()
    frametrans = parseFasta( child_stdout.readlines() )
    # release the pipe and reap the child process, just like the popen2
    # branch closes its output handle
    child_stdout.close()
    p.wait()


# command to get DNA sequence of unigene exon(s)
command = """%s %s %s | grep -v "^>" """ % (
        PYTHON_PATH,
        EXECUTABLE_GFF2FASTA,