Ejemplo n.º 1
0
def blastall_seq2seq(fastadata=(),filenames=(),output="ncbiparsed",blastprogram="blastp",remove_files=True,extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast a single query sequence against a single sbjct sequence.

    Choose exactly one proper input:
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    @type  fastadata: tuple
    @param fastadata: 2 (header, sequence) pairs; both are written to
                      (semi) unique fasta files with a relative filename

    @type  filenames: tuple
    @param filenames: ( filenameQUERY, filenameSBJCT ) of existing fasta files

    @type  output: string
    @param output: "ncbiparsed" -> return what NCBIStandalone.BlastParser
                   makes of the blastall STDOUT; any other value -> return
                   the blastall STDOUT as string

    @type  blastprogram: string
    @param blastprogram: one of 'blastp', 'tblastn', 'tblastx', 'blastx'

    @type  remove_files: Boolean
    @param remove_files: remove query file, sbjct file and `<sbjct>.*` files
                         afterwards.
                         NOTE: files passed through `filenames` are removed too!

    @type  extra_blastp_params: dict
    @param extra_blastp_params: blastall command line flags (without dash)
                                and their values; the dict is not modified

    @rtype:  * (depends on `output`)
    @return: parsed blast result or the blastall STDOUT string

    @raise ValueError: unsupported blastprogram, no input or inproper input
    """
    supported = ('blastp','tblastn','tblastx','blastx')
    if blastprogram not in supported:
        # a real exception class; string exceptions are not raisable anymore
        raise ValueError("unsupported blastprogram '%s'; choose from: %s" %
                (blastprogram, ", ".join(supported)))
    # value for the formatdb -p flag of the sbjct database;
    # tblastn and tblastx search a DNA sbjct (F), the others a protein sbjct (T)
    if blastprogram in ('tblastn','tblastx'):
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    if fastadata and isinstance(fastadata,tuple) and len(fastadata) == 2 and not filenames:
        # input is fasta headers and sequences -> write input files.
        # Spaces in headers are replaced because the filenames end up
        # unquoted in a shell command line
        uniquetag = get_random_string_tag()
        fname_q = "_".join( [ uniquetag, str(fastadata[0][0]).replace(" ","_"), 'Q.fa' ] )
        fname_s = "_".join( [ uniquetag, str(fastadata[1][0]).replace(" ","_"), 'S.fa' ] )
        for fname, record in ( (fname_q,fastadata[0]), (fname_s,fastadata[1]) ):
            fh = open(fname,'w')
            try:
                fh.write(">%s\n%s" % (record[0],record[1]))
            finally:
                fh.close()
    elif filenames and isinstance(filenames,tuple) and len(filenames) == 2 and not fastadata:
        # input is (supposed to be) filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    try:
        # formatdb
        OSsystem("%s -i %s -p %s" % (FORMATDB_PATH,fname_s,dna_or_prot))
        # and blastall!
        extra_params = " ".join(["-%s %s" % (k,v) for k,v in extra_blastp_params.items()])
        ci,co,ce = osPopen3("%s -p %s %s -i %s -d %s " % (BLASTALL_PATH,blastprogram,extra_params,fname_q,fname_s))
        try:
            ci.close()
            if output == "ncbiparsed":
                blastallout = NCBIStandalone.BlastParser().parse(co)
            else:
                blastallout = co.read()
        finally:
            # close the pipes, also when reading/parsing failed
            co.close()
            ce.close()
    finally:
        # cleanup, also when blast or parsing failed
        if remove_files:
            OSsystem("rm %s.*" % fname_s)
            osRemove("%s" % fname_s)
            osRemove("%s" % fname_q)
    # and return!
    return blastallout
Ejemplo n.º 2
0
def clustalw(inputfile="", seqs={}, remove_inputfile=True, params={}):
    """
    Run clustalw on a multi fasta file or on a dict with sequences.

    @type  inputfile: string
    @param inputfile: filename of the clustalw input file; mutually
                      exclusive with `seqs`

    @type  seqs: dict
    @param seqs: sequences (values) by header (keys); written to a (semi)
                 unique .mfa file first; mutually exclusive with `inputfile`

    @type  remove_inputfile: Boolean
    @param remove_inputfile: remove the input file afterwards.
                             NOTE: a file passed through `inputfile` is
                             removed too!

    @type  params: dict
    @param params: clustalw parameters, passed as -key=value

    @rtype:  tuple
    @return: ( _seqs, _alignment ) as returned by _parse_clustalw() on the
             .aln output file

    @raise ValueError: both or neither of `inputfile` and `seqs` specified
    @raise Exception: (one of) the sequences in `seqs` is empty
    """
    # real exception classes; string exceptions are not raisable anymore
    if inputfile and seqs:
        raise ValueError("wrong usage!")
    if not inputfile and not seqs:
        # no input at all
        raise ValueError("no input specified")

    if seqs:
        # input is (hopefully) sequences
        # do a quick check if (sequence) strings are given
        for seq in seqs.values():
            if not seq:
                raise Exception("no sequence string(s) specified: %s" % seqs)
        # make a kind of semi-unique filename
        uniqueid = get_random_string_tag()
        headers = [_nonstringheader2stringheader(hdr)
                   for hdr in list(seqs)[0:5]]
        inputfile = uniqueid + "_" + "_".join(headers) + ".mfa"
        writeMultiFasta(seqs, inputfile)
    # else: input is (hopefully) a filename

    # okay, do the clustalw with the hard-assigned parameters
    paramstring = " ".join(["-%s=%s" % (k, v) for k, v in params.items()])
    ci, co = osPopen2("%s %s %s" %
                      (EXECUTABLE_CLUSTALW, inputfile, paramstring))
    ci.close()
    try:
        # STDOUT itself is not used; reading it until EOF presumably also
        # waits until clustalw has written its output files
        co.read()
    finally:
        co.close()

    # abstract output filenames from input filename:
    # replace the part after the last dot (if any)
    dotpos = inputfile.rfind(".")
    if dotpos == -1:
        base = inputfile
    else:
        base = inputfile[0:dotpos]
    fname_out = base + ".aln"
    fname_tree = base + ".dnd"

    # parse alignment output file
    _seqs, _alignment = _parse_clustalw(fname_out)
    # and delete tmp. created files
    osRemove(fname_out)
    osRemove(fname_tree)
    if remove_inputfile:
        osRemove(inputfile)
    # NOTE: the keys (headers) in _seqs are not checked against those in
    # seqs; differences can occur when non-string headers are used

    # and return
    return (_seqs, _alignment)
Ejemplo n.º 3
0
def clustalw(inputfile="",seqs={},remove_inputfile=True,params={}):
    """
    Run clustalw on a multi fasta file or on a dict with sequences.

    @type  inputfile: string
    @param inputfile: filename of the clustalw input file; mutually
                      exclusive with `seqs`

    @type  seqs: dict
    @param seqs: sequences (values) by header (keys); written to a (semi)
                 unique .mfa file first; mutually exclusive with `inputfile`

    @type  remove_inputfile: Boolean
    @param remove_inputfile: remove the input file afterwards.
                             NOTE: a file passed through `inputfile` is
                             removed too!

    @type  params: dict
    @param params: clustalw parameters, passed as -key=value

    @rtype:  tuple
    @return: ( _seqs, _alignment ) as returned by _parse_clustalw() on the
             .aln output file

    @raise ValueError: both or neither of `inputfile` and `seqs` specified
    @raise Exception: (one of) the sequences in `seqs` is empty
    """
    # real exception classes; string exceptions are not raisable anymore
    if inputfile and seqs:
        raise ValueError("wrong usage!")
    if not inputfile and not seqs:
        # no input at all
        raise ValueError("no input specified")

    if seqs:
        # input is (hopefully) sequences
        # do a quick check if (sequence) strings are given
        for seq in seqs.values():
            if not seq:
                raise Exception("no sequence string(s) specified: %s" % seqs)
        # make a kind of semi-unique filename
        uniqueid = get_random_string_tag()
        headers = [ _nonstringheader2stringheader(hdr) for hdr in list(seqs)[0:5] ]
        inputfile = uniqueid+"_"+"_".join(headers)+".mfa"
        writeMultiFasta(seqs,inputfile)
    # else: input is (hopefully) a filename

    # okay, do the clustalw with the hard-assigned parameters
    paramstring = " ".join([ "-%s=%s" % (k,v) for k,v in params.items() ])
    ci,co = osPopen2("%s %s %s" % (EXECUTABLE_CLUSTALW,inputfile, paramstring))
    ci.close()
    try:
        # STDOUT itself is not used; reading it until EOF presumably also
        # waits until clustalw has written its output files
        co.read()
    finally:
        co.close()

    # abstract output filenames from input filename:
    # replace the part after the last dot (if any)
    dotpos = inputfile.rfind(".")
    if dotpos == -1:
        base = inputfile
    else:
        base = inputfile[0:dotpos]
    fname_out  = base+".aln"
    fname_tree = base+".dnd"

    # parse alignment output file
    _seqs,_alignment = _parse_clustalw(fname_out)
    # and delete tmp. created files
    osRemove(fname_out)
    osRemove(fname_tree)
    if remove_inputfile:
        osRemove(inputfile)
    # NOTE: the keys (headers) in _seqs are not checked against those in
    # seqs; differences can occur when non-string headers are used

    # and return
    return (_seqs,_alignment)
    def get_cexpander_uniformly_aligned_count(self):
        """
        Run cexpander on the sequences of getmaxsrproteinsequences() and
        return the number of "1" characters in its binary string output.
        """
        # TODO -> move running cexpander to one place
        # write all sequences to a (semi) unique multi fasta file
        mfa_name = "%s.tmp.cexpander.mfa" % get_random_string_tag()
        handle = open(mfa_name,'w')
        for item in self.getmaxsrproteinsequences().iteritems():
            # item is a ( node, sequence ) pair
            handle.write( ">%s\n%s\n" % item )
        handle.close()

        # run cexpander; according to the original remark, cleanup of the
        # file is taken care of (presumably by runcexpander itself)
        result = runcexpander(mfa_name,
                cbalignp_commandline = " -y", output='binary')

        # evaluate the cexpander binary string
        binary = result.binarystring
        return binary.count("1")
Ejemplo n.º 5
0
def blastall_seq2db(header,
                    sequence,
                    dbname="",
                    blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={
                        'F': 'F',
                        'e': '10'
                    }):
    """
    Blast a single sequence against an existing blast database.

    @type  header: * (presumably string)
    @param header: fasta header of the query sequence

    @type  sequence: string
    @param sequence: query sequence

    @type  dbname: string
    @param dbname: blast database name, passed to blastall as -d

    @type  blastprogram: string
    @param blastprogram: one of 'blastp', 'tblastn', 'blastn', 'blastx'

    @type  output: string
    @param output: "ncbiparsed" -> return what NCBIStandalone.BlastParser
                   makes of the blastall STDOUT; any other value -> return
                   the blastall STDOUT as string

    @type  extra_blastp_params: dict
    @param extra_blastp_params: blastall command line flags (without dash)
                                and their values; the dict is not modified

    @rtype:  * (depends on `output`) or False
    @return: parsed blast result or blastall STDOUT string; False when
             running blastall or parsing its output failed

    @raise ValueError: unsupported blastprogram
    """
    supported = ('blastp', 'tblastn', 'blastn', 'blastx')
    if blastprogram not in supported:
        # a real exception class; string exceptions are not raisable anymore
        raise ValueError("unsupported blastprogram '%s'; choose from: %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag,
         str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    fh = open(fname, 'w')
    try:
        fh.write(">%s\n%s\n" % (header, sequence))
    finally:
        fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        try:
            ci, co, ce = osPopen3(command)
            try:
                ci.close()
                if output == "ncbiparsed":
                    blastallout = NCBIStandalone.BlastParser().parse(co)
                else:
                    blastallout = co.read()
            finally:
                # close the pipes, also when reading/parsing failed
                co.close()
                ce.close()
        except Exception:
            # for some kind of - obvious or freak accident case -
            # Blast or parsing of the blast record failed
            # No debugging here; just cleanup and return False.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed
            print("BLAST CRASHED::")
            print(command)
            blastallout = False
    finally:
        # remove the created Query file, whatever happened
        osRemove(fname)
    # and return!
    return blastallout
Ejemplo n.º 6
0
    def get_cexpander_uniformly_aligned_count(self):
        """
        Run cexpander on the sequences of getmaxsrproteinsequences() and
        return the number of "1" characters in its binary string output.
        """
        # TODO -> move running cexpander to one place
        # write all sequences to a (semi) unique multi fasta file
        mfa_name = "%s.tmp.cexpander.mfa" % get_random_string_tag()
        handle = open(mfa_name, 'w')
        for item in self.getmaxsrproteinsequences().iteritems():
            # item is a ( node, sequence ) pair
            handle.write(">%s\n%s\n" % item)
        handle.close()

        # run cexpander; according to the original remark, cleanup of the
        # file is taken care of (presumably by runcexpander itself)
        result = runcexpander(mfa_name,
                              cbalignp_commandline=" -y",
                              output='binary')

        # evaluate the cexpander binary string
        binary = result.binarystring
        return binary.count("1")
Ejemplo n.º 7
0
 def get_unguided_nt_identity(self):
     """ Get identity% of UNGUIDED DNA alignment """
     # nothing to align for a zerosized object
     if self.length == 0:
         return 0.0
     # get the DNA sequences and strip the gaps of the current alignment
     gappedQ,gappedS = self.get_aligned_dna_sequences()
     dnaQ = gappedQ.replace("-","")
     dnaS = gappedS.replace("-","")
     # make (semi) unique headers from barcode coordinates and a random tag
     tag = get_random_string_tag()
     (qs,qe,ss,se) = self.barcode()[0:4]
     headerQ = "query%s%s%s" % (qs,qe,tag)
     headerS = "sbjct%s%s%s" % (ss,se,tag)
     # realign both sequences with clustalw
     aligned,_alignment = clustalw( seqs={ headerQ: dnaQ, headerS: dnaS } )
     # count the positions that are identical in both aligned sequences
     alignedQ = aligned[headerQ]
     identical = 0
     for pos,ntQ in enumerate(alignedQ):
         if ntQ == aligned[headerS][pos]:
             identical+=1
     # return relative ratio
     return float(identical) / len(alignedQ)
Ejemplo n.º 8
0
 def get_unguided_nt_identity(self):
     """ Get identity% of UNGUIDED DNA alignment """
     # nothing to align for a zerosized object
     if self.length == 0:
         return 0.0
     # get the DNA sequences and strip the gaps of the current alignment
     gappedQ, gappedS = self.get_aligned_dna_sequences()
     dnaQ = gappedQ.replace("-", "")
     dnaS = gappedS.replace("-", "")
     # make (semi) unique headers from barcode coordinates and a random tag
     tag = get_random_string_tag()
     (qs, qe, ss, se) = self.barcode()[0:4]
     headerQ = "query%s%s%s" % (qs, qe, tag)
     headerS = "sbjct%s%s%s" % (ss, se, tag)
     # realign both sequences with clustalw
     aligned, _alignment = clustalw(seqs={headerQ: dnaQ, headerS: dnaS})
     # count the positions that are identical in both aligned sequences
     alignedQ = aligned[headerQ]
     identical = 0
     for pos, ntQ in enumerate(alignedQ):
         if ntQ == aligned[headerS][pos]:
             identical += 1
     # return relative ratio
     return float(identical) / len(alignedQ)
Ejemplo n.º 9
0
def blastall_seq2db(header,sequence,dbname="",blastprogram="blastp",output="ncbiparsed",extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    Blast a single sequence against an existing blast database.

    @type  header: * (presumably string)
    @param header: fasta header of the query sequence

    @type  sequence: string
    @param sequence: query sequence

    @type  dbname: string
    @param dbname: blast database name, passed to blastall as -d

    @type  blastprogram: string
    @param blastprogram: one of 'blastp', 'tblastn', 'blastn', 'blastx'

    @type  output: string
    @param output: "ncbiparsed" -> return what NCBIStandalone.BlastParser
                   makes of the blastall STDOUT; any other value -> return
                   the blastall STDOUT as string

    @type  extra_blastp_params: dict
    @param extra_blastp_params: blastall command line flags (without dash)
                                and their values; the dict is not modified

    @rtype:  * (depends on `output`) or False
    @return: parsed blast result or blastall STDOUT string; False when
             running blastall or parsing its output failed

    @raise ValueError: unsupported blastprogram
    """
    supported = ('blastp','tblastn','blastn','blastx')
    if blastprogram not in supported:
        # a real exception class; string exceptions are not raisable anymore
        raise ValueError("unsupported blastprogram '%s'; choose from: %s" %
                (blastprogram, ", ".join(supported)))

    extra_params = " ".join(["-%s %s" % (k,v) for k,v in extra_blastp_params.items()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join( [ uniquetag, str(header).replace(" ","_"), sequence[0:10]+".fa" ] )
    fname = osPathJoin(OSgetcwd(),fname)
    fh = open(fname,'w')
    try:
        fh.write(">%s\n%s\n" % (header,sequence))
    finally:
        fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH,blastprogram,extra_params,fname,dbname)
    try:
        try:
            ci,co,ce = osPopen3(command)
            try:
                ci.close()
                if output == "ncbiparsed":
                    blastallout = NCBIStandalone.BlastParser().parse(co)
                else:
                    blastallout = co.read()
            finally:
                # close the pipes, also when reading/parsing failed
                co.close()
                ce.close()
        except Exception:
            # for some kind of - obvious or freak accident case -
            # Blast or parsing of the blast record failed
            # No debugging here; just cleanup and return False.
            # KeyboardInterrupt / SystemExit are deliberately not swallowed
            print("BLAST CRASHED::")
            print(command)
            blastallout = False
    finally:
        # remove the created Query file, whatever happened
        osRemove(fname)
    # and return!
    return blastallout
Ejemplo n.º 10
0
def _create_hmm_db(organism,
                   inputdict,
                   cbg,
                   prev,
                   next,
                   orf_must_have_start=False,
                   max_intron_nt_length=200,
                   verbose=False):
    """
    Create fasta ORF database for a organism in a CBG and its viscinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict 
    @param inputdict: <input data structure> 

    @type  cbg: CodingBlockGraph or related object
    @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This Speeds up and
                improves the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take
                                 into acount when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)
    """

    # fullpath filename of result hmm multi fasta database
    fname_hmm_db_mfa = None
    if not cbg: return fname_hmm_db_mfa

    # (1) try to limit searchspace by prev and next CBG
    prevNode, nextNode = None, None
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if prev and organism in prev.organism_set() and\
    prev.has_overall_minimal_spanning_range():
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if next and organism in next.organism_set() and\
    next.has_overall_minimal_spanning_range():
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            aaseqlen = len(inputdict[organism]['genomeseq']) / 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get elegiable sets of orfs from prev and next
    if not orf_must_have_start:
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax)
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs: return fname_hmm_db_mfa

    # (3) write masked orfs to fasta database multi line string
    db_fasta = inputdict[organism]['orfs'].tomaskedfasta(
        coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism)
    if orf_must_have_start:
        if len(db_fasta.strip()) == 0:
            # no UNmasked suitable ORFs remaining!
            # This is recognized lateron in this function
            pass
        else:
            # mask out all AAs before the first start
            lines = db_fasta.split("\n")
            for linenr in range(0, len(lines)):
                line = lines[linenr]
                if line[0] != ">":
                    mpos = line.find("M")
                    if mpos > 0:
                        line = "X" * mpos + line[mpos:]
                    lines[linenr] = line
            # recreate db_fasta string
            db_fasta = "\n".join(lines)

    ############################################################################
    if verbose:
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        print "hmm-elegibable orfs:", organism, orfidlist, "/",
        print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin,
        if prev:
            print prev.has_overall_minimal_spanning_range(),
        else:
            print None,
        print "nextMax:", nextMax,
        if next:
            print next.has_overall_minimal_spanning_range()
        else:
            print None
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database
    fh = open(fname_hmm_db_mfa, 'w')
    fh.write(db_fasta)
    fh.close()

    # (6) make shure that there where orfs written to file;
    # in case very little orfs are selected and all are masked -> no files!
    seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines())
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
Ejemplo n.º 11
0
def createblastdbs(input,
                   GSG,
                   OPTIONS,
                   dbfraction=None,
                   organism=None,
                   acceptorfids=[],
                   rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas that are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary; input[org]['blastdb']
                  is set to the filename of the created database

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attributes 'abinitio'
                    and 'outdir')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!

    @rtype:  dict
    @return: number of orf sequences in the blast database, by organism
    """
    # acceptorfids and rejectorfids only valid in combi with `organism`
    if not organism:
        acceptorfids, rejectorfids = [], []

    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org != organism: continue

        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag, org)
        fullpath = osPathJoin(OPTIONS.outdir, fname)
        fh = open(fullpath, 'w')
        try:
            seqsindb[org] = 0

            # distinct cases possible:
            if len(GSG):
                # there is already a GSG, so this is not the first blast iteration
                # do not apply a shortcut when OPTIONS.abinitio == False
                coords = GSG.omsr2mask(org)
                # the general limitation on sublists of orfids always applies
                selection = {
                    'acceptorfids': acceptorfids,
                    'rejectorfids': rejectorfids,
                }
                if dbfraction == 'GSGupstream':
                    # take only orfs LEFT of the first CBG in GSG
                    selection['max_orf_start'] = max(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGdownstream':
                    # take only orfs RIGTH of the last CBG in GSG
                    selection['min_orf_end'] = min(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                elif dbfraction == 'GSGcentral':
                    # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                    selection['max_orf_start'] = max(
                        GSG[-1].overall_minimal_spanning_range(
                            organism=org)) * 3
                    selection['min_orf_end'] = min(
                        GSG[0].overall_minimal_spanning_range(
                            organism=org)) * 3
                # else: dbfraction equals 'all' or None (or anything else)
                # -> no positional limitation, just take all orfs!
                orflist = input[org]['orfs'].get_elegiable_orfs(**selection)

                # create masked fasta of this sequence part only
                newfasta = input[org]['orfs'].tomaskedfasta(
                    coords=coords, orflist=orflist, header_prefix=org)
                # write to file and count accessions in this file -> seqsindb[org]
                fh.write(newfasta)
                seqsindb[org] = newfasta.count(">")

            else:
                # No filled GSG objects -> no a priori knowledge yet
                # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
                # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
                for orf in input[org]['orfs'].orfs:
                    # in case not abinitio, make only a db of orfs in the current annotation!
                    # `== False` is kept on purpose: a None value must not
                    # trigger this filter
                    if OPTIONS.abinitio == False and orf.id not in input[org][
                            'orfid-genestructure']:
                        continue
                    if orf.id in rejectorfids:
                        # ignore Orfs that are listed as to-be-ignored
                        continue
                    if acceptorfids and orf.id not in acceptorfids:
                        # ignore Orfs that are not listed as to-be-accepted
                        continue
                    # write fasta of orf to file
                    fh.write(
                        orf.tofasta(header="%s_orf_%s" % (org, orf.id)) + "\n")
                    # increase seqsindb[org] counter
                    seqsindb[org] += 1
        finally:
            # close the filehandle, also when selecting/masking orfs failed
            fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
Ejemplo n.º 12
0
def createblastdbs(input,GSG,OPTIONS,dbfraction=None,organism=None,acceptorfids=[],rejectorfids=[]):
    """
    (Re)create blast-db's by masking the areas that are incorporated in the GSG

    @type  input: dict
    @param input: `input` data structure dictionary; input[org]['blastdb']
                  is set to the filename of the created database

    @type  GSG: GenestructureOfCodingBlockGraphs
    @param GSG: GenestructureOfCodingBlockGraphs instance

    @type  OPTIONS: optparse options instance
    @param OPTIONS: optparse options instance (with attributes 'abinitio'
                    and 'outdir')

    @type  dbfraction: string
    @param dbfraction: None, 'all', 'GSGupstream', 'GSGcentral', 'GSGdownstream', 'annotation'

    @type  organism: organism identifier
    @param organism: only recreate blastdb for this organism/gene identifier

    @type  acceptorfids: list with integers
    @param acceptorfids: list of orf ids to accept

    @type  rejectorfids: list with integers
    @param rejectorfids: list of orf ids to reject

    @attention: acceptorfids and rejectorfids are only used when organism is specified!

    @rtype:  dict
    @return: number of orf sequences in the blast database, by organism
    """
    # acceptorfids and rejectorfids only valid in combi with `organism`
    if not organism:
        acceptorfids, rejectorfids = [], []

    seqsindb = {}
    for org in input.keys():
        # if organism is given, do only this one
        if organism and org!=organism: continue

        # assign blast database name / multi fasta file and open filehandle
        uniquetag = get_random_string_tag()
        fname = '%s-blastdb-%s.fa' % (uniquetag,org)
        fullpath = osPathJoin(OPTIONS.outdir,fname)
        fh = open(fullpath,'w')
        try:
            seqsindb[org] = 0

            # distinct cases possible:
            if len(GSG):
                # there is already a GSG, so this is not the first blast iteration
                # do not apply a shortcut when OPTIONS.abinitio == False
                coords = GSG.omsr2mask(org)
                # the general limitation on sublists of orfids always applies
                selection = { 'acceptorfids': acceptorfids, 'rejectorfids': rejectorfids }
                if dbfraction == 'GSGupstream':
                    # take only orfs LEFT of the first CBG in GSG
                    selection['max_orf_start'] = max(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                elif dbfraction == 'GSGdownstream':
                    # take only orfs RIGTH of the last CBG in GSG
                    selection['min_orf_end'] = min(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                elif dbfraction == 'GSGcentral':
                    # take only orfs in between FIRST and LAST CBG in GSG (can be only one CBG!)
                    selection['max_orf_start'] = max(GSG[-1].overall_minimal_spanning_range(organism=org)) * 3
                    selection['min_orf_end']   = min(GSG[0].overall_minimal_spanning_range(organism=org)) * 3
                # else: dbfraction equals 'all' or None (or anything else)
                # -> no positional limitation, just take all orfs!
                orflist = input[org]['orfs'].get_elegiable_orfs(**selection)

                # create masked fasta of this sequence part only
                newfasta = input[org]['orfs'].tomaskedfasta(coords=coords,orflist=orflist,header_prefix=org)
                # write to file and count accessions in this file -> seqsindb[org]
                fh.write(newfasta)
                seqsindb[org] = newfasta.count(">")

            else:
                # No filled GSG objects -> no a priori knowledge yet
                # When dbfraction=='annotated' and !OPTIONS.abinitio -> take annotated orfs only
                # TODO: dbfraction is not checked/used here -> just OPTIONS.abinitio
                for orf in input[org]['orfs'].orfs:
                    # in case not abinitio, make only a db of orfs in the current annotation!
                    # `== False` is kept on purpose: a None value must not
                    # trigger this filter
                    if OPTIONS.abinitio == False and orf.id not in input[org]['orfid-genestructure']:
                        continue
                    if orf.id in rejectorfids:
                        # ignore Orfs that are listed as to-be-ignored
                        continue
                    if acceptorfids and orf.id not in acceptorfids:
                        # ignore Orfs that are not listed as to-be-accepted
                        continue
                    # write fasta of orf to file
                    fh.write(orf.tofasta(header="%s_orf_%s" % (org,orf.id))+"\n")
                    # increase seqsindb[org] counter
                    seqsindb[org]+=1
        finally:
            # close the filehandle, also when selecting/masking orfs failed
            fh.close()
        # run formatdb
        formatdb(fname=fullpath)
        # set name of blastdb in infodict
        input[org]['blastdb'] = fullpath

    # return the counter of how much orf sequences are stored in the blast database
    return seqsindb
Ejemplo n.º 13
0
def _create_hmm_profile(cbg,area="OMSR",prevcbg=None,nextcbg=None,
    strip_nonaligned_residues=False,
    verbose=False,**kwargs):
    """
    """
    # area must be one of 
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(coordrange), max(omsr[node])+1 ) )
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords,verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node,coordrange in coords.iteritems():
            coords[node] = Set( range( min(omsr[node]), max(coordrange)+1 ) )
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node])+1,theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del(coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR","LEFTSPRDIF","OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( prevcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max( [ max(omsr[nodePrev])+1, min(coords[nodeCbg]) ] )
            end = max(coords[nodeCbg])+1
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR","RIGTHSPRDIF","OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection( nextcbg.organism_set() ):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg  = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min( [ min(omsr[nodeNext]), max(coords[nodeCbg])+1 ] )
            coords[nodeCbg] = Set(range(sta,end))
            if not coords[nodeCbg]: del( coords[nodeCbg] )

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in ["MAXSR","LEFTSPRDIF","RIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF","OMSRANDRIGTHSPRDIF","RIGTHORFEND"]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose: print area, sum([(max(v)-min(v)) for k,v in coords.iteritems()]),len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in ["RIGTHSPRDIF","LEFTSPRDIF","OMSRANDRIGTHSPRDIF",
    "OMSRANDLEFTSPRDIF"]:
        maxlength = max([ len(vlist) for vlist in coords.values() ])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and 
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k,seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del(coords[k])
        del(fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([ (key,[min(vlist),max(vlist)+1]) for key,vlist in coords.iteritems() ])

    # perform clustalw multiple alignment
    (alignedseqs,alignment) = clustalw( seqs= fastaseqs )


    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR","MINSR"]:
        alignedseqs,alignment,coords = strip_alignment_for_exterior_gaps(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )


    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs,alignment,coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords),0.20 )


    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs,alignment,coords = strip_overall_nonaligned_residues(
                deepcopy(alignedseqs),deepcopy(alignment),deepcopy(coords) )
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}


    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:",prevcbg!=None,area,nextcbg!=None
        for node,algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs,fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein( fname_hmm_profile )

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Ejemplo n.º 14
0
def _create_hmm_db(organism,inputdict,cbg,prev,next,
    orf_must_have_start=False,max_intron_nt_length=200,
    verbose=False):
    """
    Create fasta ORF database for a organism in a CBG and its viscinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict 
    @param inputdict: <input data structure> 

    @type  cbg: CodingBlockGraph or related object
    @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This Speeds up and
                improves the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take
                                 into acount when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)
    """

    # fullpath filename of result hmm multi fasta database
    fname_hmm_db_mfa = None
    if not cbg: return fname_hmm_db_mfa

    # (1) try to limit searchspace by prev and next CBG
    prevNode, nextNode = None, None
    prevMin,  nextMax  = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if prev and organism in prev.organism_set() and\
    prev.has_overall_minimal_spanning_range():
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            prevMin = (max(omsr)+1)*3
            maskcoords.append( ( 0, max(omsr) ) )
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass


    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if next and organism in next.organism_set() and\
    next.has_overall_minimal_spanning_range():
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr)*3
            aaseqlen = len(inputdict[organism]['genomeseq'])/3
            maskcoords.append( ( min(omsr), aaseqlen ) )
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length 

    # (2a) get elegiable sets of orfs from prev and next
    if not orf_must_have_start:
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
                min_orf_end = prevMin, max_orf_start = nextMax
                )
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
                min_orf_end = prevMin, max_orf_start = nextMax,
                has_starts=True
                )

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs: return fname_hmm_db_mfa

    # (3) write masked orfs to fasta database multi line string
    db_fasta = inputdict[organism]['orfs'].tomaskedfasta(
            coords=maskcoords,
            orflist=elegiable_orfs,
            header_prefix=organism) 
    if orf_must_have_start:
        if len(db_fasta.strip()) == 0:
            # no UNmasked suitable ORFs remaining!
            # This is recognized lateron in this function 
            pass
        else:
            # mask out all AAs before the first start
            lines = db_fasta.split("\n")
            for linenr in range(0,len(lines)):
                line = lines[linenr]
                if line[0] != ">":
                    mpos = line.find("M")
                    if mpos > 0:
                        line = "X"*mpos+line[mpos:]
                    lines[linenr] = line
            # recreate db_fasta string
            db_fasta = "\n".join(lines)

    ############################################################################
    if verbose:
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [ orf.id for orf in elegiable_orfs ]
        print "hmm-elegibable orfs:", organism, orfidlist, "/",
        print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin,
        if prev:
            print prev.has_overall_minimal_spanning_range(),
        else:
            print None,
        print "nextMax:", nextMax,
        if next:
            print next.has_overall_minimal_spanning_range()
        else:
            print None
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base,organism)

    # (5) write masked orfs to fasta database
    fh = open(fname_hmm_db_mfa,'w')
    fh.write( db_fasta )
    fh.close()

    # (6) make shure that there where orfs written to file;
    # in case very little orfs are selected and all are masked -> no files!
    seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines())
    if not seqs_in_db:
        # delete this (empty) file
        osRemove( fname_hmm_db_mfa )
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
Ejemplo n.º 15
0
def runcexpander(fname_fasta,cbalignp_commandline=" -y",output='binary'):
    """
    Run the complete cascade of cexpander algorithms on an input multi fasta
    file and return the output as a CexpanderOutput object

    @type  fname_fasta: string
    @param fname_fasta: path to input multi fasta file

    @type  cbalignp_commandline: string
    @param cbalignp_commandline: (extra) command line for cbalignp

    @type  output: string
    @param output: 'float' requests the $dumpcvc dump of cexpander; any other
                   value (default 'binary') requests the $dumpcv dump

    @attention: requires global variable EXECUTABLE_CEXPANDER_ALLVSALL
    @attention: requires global variable EXECUTABLE_CEXPANDER_CBALIGNP
    @attention: requires global variable EXECUTABLE_CEXPANDER_CEXPANDER
    @attention: see cexpander_dr for (additional) command line options
    @attention: only a subset of cexpander_dr commandline options are supported!
    @attention: the input file fname_fasta itself is REMOVED in the cleanup
                step, together with all intermediate files
    @attention: STDERR of the command cascade is read but not inspected

    @rtype:  CexpanderOutput object (as returned by parse_cexpander)
    @return: parsed STDOUT of the command cascade

    @raise ValueError: no fname_fasta applied
    @raise IOError: fname_fasta is not an existing file
    """
    # string exceptions are not raisable in Python >= 2.6 (they result in a
    # TypeError); raise proper exceptions that keep the original tag
    if not fname_fasta:
        raise ValueError("NoProperFunctionArguments: no fname_fasta applied")
    if not osPathIsfile(fname_fasta):
        raise IOError("FileDoesNotExist: %s" % fname_fasta)

    # (0) create (~unique) filenames
    uniquetag = get_random_string_tag() 
    fname_allvsall  = ".".join([fname_fasta,uniquetag,"allvsall"])
    fname_report    = ".".join([fname_fasta,uniquetag,"report"])
    fname_aligned   = ".".join([fname_fasta,uniquetag,"aligned"])
    fname_settings  = ".".join([fname_fasta,uniquetag,"settings"])

    # (1) create complete .fa -> cexpanderstring command
    command = """
        python %s %s %s %s;
        %s -i %s %s > %s;
        %s < %s;
        """ % (
        EXECUTABLE_CEXPANDER_ALLVSALL,
        fname_fasta,
        fname_allvsall,
        fname_report,
        EXECUTABLE_CEXPANDER_CBALIGNP,
        fname_allvsall,
        cbalignp_commandline,
        fname_aligned,
        EXECUTABLE_CEXPANDER_CEXPANDER,
        fname_settings,
        )


    # (2) create fname_settings file; it is fed to cexpander on STDIN
    binorfloat = "$dumpcv"
    if output == "float": binorfloat = "$dumpcvc"
    content = "\n\n".join( [
        "$load\n%s\n%s" % (fname_report,fname_aligned),
        "$addquery\n-1",
        "$run",
        "$dumpentries",
        "$cv_linear",
        "%s" % ( binorfloat ), # BINARY == $dumpcv, FLOAT = $dumpcvc
        "$exit\n\n", 
        ] )
    fh = open(fname_settings,'w')
    try:
        fh.write(content)
    finally:
        fh.close()


    # (3) run the command
    ci,co,ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderdata = co.read()
    co.close()
    # drain STDERR; its content is not used
    ce.read()
    ce.close()

    # (4) parse the captured STDOUT to CexpanderOutput object
    cxpdr = parse_cexpander(cexpanderdata,fname_fasta)

    # (5) cleanup files: the input fasta AND all <fname_fasta>.<uniquetag>.*
    osSystem("rm -f %s %s.%s.*" % ( fname_fasta, fname_fasta,uniquetag ) )

    # (6) return the output object
    return cxpdr
Ejemplo n.º 16
0
def _create_hmm_profile(cbg,
                        area="OMSR",
                        prevcbg=None,
                        nextcbg=None,
                        strip_nonaligned_residues=False,
                        verbose=False,
                        **kwargs):
    """
    """
    # area must be one of
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del (coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors id required
    if area in [
            "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
            "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"
    ]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in [
            "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF"
    ]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences and
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k, seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del (coords[k])
        del (fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.iteritems()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None
        for node, algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM serach profile filename
    return fname_hmmbuild_file, coords
Ejemplo n.º 17
0
def runcexpander(fname_fasta, cbalignp_commandline=" -y", output='binary'):
    """
    Run the complete cascade of cexpander algorithms on an input multi fasta
    file and return the output as a CexpanderOutput object

    @type  fname_fasta: string
    @param fname_fasta: path to input multi fasta file

    @type  cbalignp_commandline: string
    @param cbalignp_commandline: (extra) command line for cbalignp

    @type  output: string
    @param output: 'float' requests the $dumpcvc dump of cexpander; any other
                   value (default 'binary') requests the $dumpcv dump

    @attention: requires global variable EXECUTABLE_CEXPANDER_ALLVSALL
    @attention: requires global variable EXECUTABLE_CEXPANDER_CBALIGNP
    @attention: requires global variable EXECUTABLE_CEXPANDER_CEXPANDER
    @attention: see cexpander_dr for (additional) command line options
    @attention: only a subset of cexpander_dr commandline options are supported!
    @attention: the input file fname_fasta itself is REMOVED in the cleanup
                step, together with all intermediate files
    @attention: STDERR of the command cascade is read but not inspected

    @rtype:  CexpanderOutput object (as returned by parse_cexpander)
    @return: parsed STDOUT of the command cascade

    @raise ValueError: no fname_fasta applied
    @raise IOError: fname_fasta is not an existing file
    """
    # string exceptions are not raisable in Python >= 2.6 (they result in a
    # TypeError); raise proper exceptions that keep the original tag
    if not fname_fasta:
        raise ValueError("NoProperFunctionArguments: no fname_fasta applied")
    if not osPathIsfile(fname_fasta):
        raise IOError("FileDoesNotExist: %s" % fname_fasta)

    # (0) create (~unique) filenames
    uniquetag = get_random_string_tag()
    fname_allvsall = ".".join([fname_fasta, uniquetag, "allvsall"])
    fname_report = ".".join([fname_fasta, uniquetag, "report"])
    fname_aligned = ".".join([fname_fasta, uniquetag, "aligned"])
    fname_settings = ".".join([fname_fasta, uniquetag, "settings"])

    # (1) create complete .fa -> cexpanderstring command
    command = """
        python %s %s %s %s;
        %s -i %s %s > %s;
        %s < %s;
        """ % (
        EXECUTABLE_CEXPANDER_ALLVSALL,
        fname_fasta,
        fname_allvsall,
        fname_report,
        EXECUTABLE_CEXPANDER_CBALIGNP,
        fname_allvsall,
        cbalignp_commandline,
        fname_aligned,
        EXECUTABLE_CEXPANDER_CEXPANDER,
        fname_settings,
    )

    # (2) create fname_settings file; it is fed to cexpander on STDIN
    binorfloat = "$dumpcv"
    if output == "float": binorfloat = "$dumpcvc"
    content = "\n\n".join([
        "$load\n%s\n%s" % (fname_report, fname_aligned),
        "$addquery\n-1",
        "$run",
        "$dumpentries",
        "$cv_linear",
        "%s" % (binorfloat),  # BINARY == $dumpcv, FLOAT = $dumpcvc
        "$exit\n\n",
    ])
    fh = open(fname_settings, 'w')
    try:
        fh.write(content)
    finally:
        fh.close()

    # (3) run the command
    ci, co, ce = osPopen3(command)
    ci.close()
    # output of EXECUTABLE_CEXPANDER_ALLVSALL is cast to STDOUT as well!
    cexpanderdata = co.read()
    co.close()
    # drain STDERR; its content is not used
    ce.read()
    ce.close()

    # (4) parse the captured STDOUT to CexpanderOutput object
    cxpdr = parse_cexpander(cexpanderdata, fname_fasta)

    # (5) cleanup files: the input fasta AND all <fname_fasta>.<uniquetag>.*
    osSystem("rm -f %s %s.%s.*" % (fname_fasta, fname_fasta, uniquetag))

    # (6) return the output object
    return cxpdr
Ejemplo n.º 18
0
def blastall_seq2seq(fastadata=(),
                     filenames=(),
                     output="ncbiparsed",
                     blastprogram="blastp",
                     remove_files=True,
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """
    Blast one query sequence against one subject sequence with blastall.

    The subject file is indexed with formatdb, after which blastall is run
    with the query file as input and the subject file as database.

    choose proper input (exactly one of both):
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    @type  fastadata: tuple
    @param fastadata: 2 (header, sequence) records, query first; both are
                      written to freshly named fasta files
    @type  filenames: tuple
    @param filenames: 2 filenames of existing fasta files, query first
    @type  output: string
    @param output: "ncbiparsed" returns whatever
                   NCBIStandalone.BlastParser().parse() returns; any other
                   value returns the raw blastall STDOUT as read from the pipe
    @type  blastprogram: string
    @param blastprogram: one of 'blastp', 'tblastn', 'tblastx', 'blastx'
    @type  remove_files: Boolean
    @param remove_files: when True, `rm <subject>.*` is run and the subject
                         and query files are removed afterwards.
                         NOTE: this holds for caller-supplied `filenames` too!
    @type  extra_blastp_params: dict
    @param extra_blastp_params: extra blastall options, rendered as
                                "-key value" on the command line (not
                                modified by this function)

    @rtype:  parsed blast record or string (depends on `output`)
    @return: blastall result

    @raise ValueError: unsupported blastprogram, no input at all, or input
                       that is not exactly one proper 2-tuple
    """
    supported_programs = ('blastp', 'tblastn', 'tblastx', 'blastx')
    if blastprogram not in supported_programs:
        raise ValueError("unsupported blastprogram %r; supported are: %s" %
                         (blastprogram, ", ".join(supported_programs)))

    # value for formatdb's -p switch; tblastn/tblastx get "F", others "T"
    # (presumably F == nucleotide subject, T == protein subject)
    if blastprogram in ('tblastn', 'tblastx'):
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    # NOTE(review): fasta headers and filenames end up unquoted in filenames
    # and shell command lines below; whitespace, '/' or shell metacharacters
    # in them will break (or abuse) these commands.
    if (fastadata and isinstance(fastadata, tuple) and len(fastadata) == 2
            and not filenames):
        # input is fasta headers and sequences; write them to (~unique) files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        for fname, record in ((fname_q, fastadata[0]),
                              (fname_s, fastadata[1])):
            fh = open(fname, 'w')
            try:
                fh.write(">%s\n%s" % (record[0], record[1]))
            finally:
                # close the handle even when writing fails
                fh.close()
    elif (filenames and isinstance(filenames, tuple) and len(filenames) == 2
            and not fastadata):
        # input is (supposed to be) filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    try:
        # formatdb
        OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))
        # and blastall!
        extra_params = " ".join(
            ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
        ci, co, ce = osPopen3(
            "%s -p %s %s -i %s -d %s " %
            (BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
        try:
            ci.close()
            if output == "ncbiparsed":
                b_parser = NCBIStandalone.BlastParser()
                blastallout = b_parser.parse(co)
            else:
                blastallout = co.read()
        finally:
            # release the pipes, also when parsing the output failed
            co.close()
            ce.close()
    finally:
        # cleanup is done regardless of success, so no files linger around
        if remove_files:
            # `<subject>.*` are presumably the index files made by formatdb
            OSsystem("rm %s.*" % fname_s)
            osRemove("%s" % fname_s)
            osRemove("%s" % fname_q)
    # and return!
    return blastallout