Example 1
def blastall_seq2seq(fastadata=(),filenames=(),output="ncbiparsed",blastprogram="blastp",remove_files=True,extra_blastp_params={'F': 'F', 'e': '10'}):
    """
    choose proper input:
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )
    """
    input = None

    if blastprogram not in ['blastp','tblastn','tblastx','blastx']:
        raise "only blastp and tblastn are supported"
    elif blastprogram in ['tblastn','tblastx']:
        dna_or_prot = "F"
    else:
        dna_or_prot = "T"

    if fastadata and isinstance(fastadata, tuple) and len(fastadata) == 2 and not filenames:
        # input is fasta headers and sequence
        input = "fastadata"
        # write input filenames
        uniquetag = get_random_string_tag()
        fname_q = "_".join( [ uniquetag, str(fastadata[0][0]), 'Q.fa' ] )
        fname_s = "_".join( [ uniquetag, str(fastadata[1][0]), 'S.fa' ] )
        fh = open(fname_q,'w')
        fh.write(">%s\n%s" % (fastadata[0][0],fastadata[0][1]))
        fh.close()
        fh = open(fname_s,'w')
        fh.write(">%s\n%s" % (fastadata[1][0],fastadata[1][1]))
        fh.close()
    elif filenames and isinstance(filenames, tuple) and len(filenames) == 2 and not fastadata:
        # input is (supposed to be) filenames
        input = "filenames"
        # get filenames
        fname_q = filenames[0]
        fname_s = filenames[1]
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("improper input!")

    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH,fname_s,dna_or_prot))
    # and blastall!
    extra_params = " ".join(["-%s %s" % (k,v) for k,v in extra_blastp_params.iteritems()])
    ci,co,ce = osPopen3("%s -p %s %s -i %s -d %s " % (BLASTALL_PATH,blastprogram,extra_params,fname_q,fname_s))
    ci.close()
    if output == "ncbiparsed":
        b_parser = NCBIStandalone.BlastParser()
        blastallout = b_parser.parse(co)
    else:
        blastallout = co.read()
    co.close()
    ce.close()
    if remove_files:
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)
    # and return!
    return blastallout
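
These snippets all call the standard library through aliases such as osRemove, osPopen3, OSsystem and osPathJoin. The module that defines those aliases is not shown on this page, so the following sketch is only a guess at the mapping (Python 2, since os.popen/os.popen2/os.popen3 no longer exist in Python 3):

# Presumed aliases; the actual aliasing module is not shown in these examples.
from os import remove as osRemove
from os import system as OSsystem          # some examples spell it osSystem
from os import getcwd as OSgetcwd
from os import popen as osPopen            # Python 2 only
from os import popen2 as osPopen2          # Python 2 only
from os import popen3 as osPopen3          # Python 2 only
from os.path import join as osPathJoin
from os.path import exists as osPathExists
from os.path import isdir as osPathIsdir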
Example 2
    def tearDown(self):

        del self.app
        del self._umlFrame
        for x in range(4):

            try:
                osRemove(f'{HISTORY_FILE_NAME}{x}')
            except Exception:
                pass  # we truly want to ignore any failure here
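
In Python 3 the same ignore-if-missing deletion can be written with contextlib.suppress. A minimal self-contained sketch; the HISTORY_FILE_NAME value is assumed for illustration, not taken from the source:

# Hypothetical Python 3 variant of the cleanup loop above.
from contextlib import suppress
from os import remove as osRemove

HISTORY_FILE_NAME = 'history.json'  # assumed value for illustration

def remove_history_files(count=4):
    # Delete numbered history files, silently skipping any that do not exist.
    for x in range(count):
        with suppress(OSError):
            osRemove(f'{HISTORY_FILE_NAME}{x}')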
Example 3
def comprimirArchivo(self, archivo):
    print "Comprimiendo %s" % archivo

    try:
        f_in = open(archivo, 'rb')
        f_out = gzipOpen(archivo+'.gz', 'wb')
        f_out.writelines(f_in)
        f_out.close()
        f_in.close()
        osRemove(archivo)
    except (IOError, OSError):
        pass  # compression failed; leave the original file in place
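
A modern sketch of the same compress-then-delete pattern, using context managers so both files are closed even if compression fails. Self-contained Python 3; the function name is illustrative:

# Hypothetical Python 3 rewrite of the pattern above.
import gzip
import os
import shutil

def compress_file(path):
    # Gzip the file at `path`, then remove the original only on success.
    with open(path, 'rb') as f_in, gzip.open(path + '.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.remove(path)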
Example 4
    def _restoreBackup(self):

        preferencesFileName: str = Preferences.getPreferencesLocation()
        source: str = f"{preferencesFileName}{TestPreferences.BACKUP_SUFFIX}"
        target: str = preferencesFileName
        if osPath.exists(source):
            try:
                copyfile(source, target)
            except IOError as e:
                self.logger.error(f"Unable to copy file. {e}")

            osRemove(source)
        else:
            osRemove(target)
Example 5
def blastall_seq2db(header,
                    sequence,
                    dbname="",
                    blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={
                        'F': 'F',
                        'e': '10'
                    }):
    """
    """
    if blastprogram not in ['blastp', 'tblastn', 'blastn', 'blastx']:
        raise "only blastp and tblastn are supported"

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.iteritems()])
    # generate (semi ;-) unique filename
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag,
         str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    fh = open(fname, 'w')
    fh.write(">%s\n%s\n" % (header, sequence))
    fh.close()
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            b_parser = NCBIStandalone.BlastParser()
            blastallout = b_parser.parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except:
        # for some kind of - obvious or freak accident case -
        # Blast or parsing of the blast record failed
        # No debugging here; just cleanup and return False
        print "BLAST CRASHED::"
        print command
        blastallout = False

    # remove the created Query file
    osRemove(fname)
    # and return!
    return blastallout
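
A hypothetical call of blastall_seq2db (Python 2, matching the example). The database name and query sequence are made up, and the database must already have been formatted with formatdb:

# Hypothetical usage; 'mydb.fa' and the query sequence are illustrative only.
record = blastall_seq2db("query1", "MSTNPKPQRKTKRNTNRRPQDVK",
                         dbname="mydb.fa", blastprogram="blastp")
if record:
    for alignment in record.alignments:
        print alignment.title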
Example 6
def _file_cleanup(fnamelist, include_directories=False):
    """ """
    for fname in fnamelist:
        if osPathExists(str(fname)):
            try:
                osRemove(str(fname))
            except OSError:
                if osPathIsdir(str(fname)) and include_directories:
                    osSystem("rm -rf %s" % fname)
            except:
                # failed !? on cluster computing, this
                # could be the case when an identical filename
                # is created/owned by another user.
                # I suspect this might happen for formatdb.log ;-)
                pass
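
A hypothetical usage of _file_cleanup, removing BLAST temporaries including a directory; all filenames are illustrative:

# Hypothetical usage; the filenames are made up.
_file_cleanup(["query.fa", "db.fa.phr", "db.fa.pin", "db.fa.psq",
               "formatdb.log", "tmp_blast_dir"],
              include_directories=True)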
Example 8
    def _assertIdenticalFiles(self, baseName: str, generatedFileName: str, failMessage: str, removeTestFile: bool = True) -> None:
        """
        The side effect here is that if the assertion passes then this method removes the generated file

        Args:
            baseName:           The base file name
            generatedFileName:  The generated file name
            failMessage:        The message to display if the files fail comparison
            removeTestFile:     If True (the default), remove the generated file after a passing comparison
        """

        standardFileName: str = self._getFullyQualifiedPdfPath(f'{baseName}{TestDiagramParent.STANDARD_SUFFIX}{TestConstants.TEST_SUFFIX}')
        status:           int = self._runPdfDiff(baseFileName=generatedFileName, standardFileName=standardFileName)

        self.assertEqual(status, 0, failMessage)

        if removeTestFile:
            self.logger.info(f'Removing: {generatedFileName}')
            osRemove(generatedFileName)
Example 9
def delete(user, ui):
    if len(ui) != 2:
        print(">>> USAGE: delete <tablename>")
        return

    try:
        with open(ui[1] + '.csv', 'r') as f:
            if not authenticate(user, ui[1], 'w'):
                print(">>> ERROR: User", user, "does not have access to",
                      ui[1])
                return
            print("This will permanently delete table", ui[1])
            if input('Are you sure you want to proceed? (y/n)>') != 'y':
                return
    except FileNotFoundError:
        print(">>> ERROR: Table", ui[1], "does not exist.")
        return

    osRemove(ui[1] + '.csv')
    print("Table", ui[1], "deleted.")
Example 10
    def testJsonSerialization(self):

        gState: GameState = GameState()
        gState.playerType = PlayerType.Emeritus
        gState.gameType = GameType.Medium
        gState.starDate = 40501.0

        gState.remainingGameTime = 42.42424242

        gState.currentQuadrantCoordinates = Coordinates(4, 4)
        gState.currentSectorCoordinates = Coordinates(9, 9)

        jsonGState: str = jsonpickle.encode(gState, indent=4)
        self.assertIsNotNone(jsonGState, "Pickling failed")

        self.logger.info(f'json game stats: {jsonGState}')

        file: TextIO = open(TestGameState.TEST_PICKLE_FILENAME, 'w')
        file.write(jsonGState)
        file.close()

        jsonFile: TextIO = open(TestGameState.TEST_PICKLE_FILENAME, 'r')
        jsonStr: str = jsonFile.read()
        self.assertIsNotNone(jsonStr)
        jsonFile.close()

        thawedGameState: GameState = jsonpickle.decode(jsonStr)
        self.assertIsNotNone(thawedGameState, "Did that thaw?")

        self.assertEqual(gState.playerType, thawedGameState.playerType,
                         "Player type did not thaw")
        self.assertEqual(gState.gameType, thawedGameState.gameType,
                         "Game type did not thaw")
        self.assertEqual(gState.starDate, thawedGameState.starDate,
                         "Star date did not thaw")
        self.assertEqual(gState.remainingGameTime,
                         thawedGameState.remainingGameTime,
                         "Remaining game time did not thaw")

        osRemove(TestGameState.TEST_PICKLE_FILENAME)
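
The explicit write/read/osRemove round-trip in this test could also use a self-deleting temporary file. A minimal sketch, assuming only that jsonGState is the encoded string produced above and that jsonpickle is imported as in the test:

# Hypothetical variant using tempfile, so no explicit osRemove is needed.
import tempfile

with tempfile.NamedTemporaryFile(mode='w+', suffix='.json') as fd:
    fd.write(jsonGState)   # jsonGState as produced in the test above
    fd.seek(0)
    thawedGameState = jsonpickle.decode(fd.read())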
Example 11
def clustalw(inputfile="", seqs={}, remove_inputfile=True, params={}):
    """
    """
    if inputfile and seqs:
        raise "wrong usage!"
    elif inputfile and not seqs:
        # input is (hopefully) a filename
        pass
    elif not inputfile and seqs:
        # input is (hopefully) sequences
        # do a quick check that every sequence string is non-empty
        ARE_ALL_STRINGS = True
        for header, seq in seqs.iteritems():
            if not seq:
                ARE_ALL_STRINGS = False
                break
        if not ARE_ALL_STRINGS:
            raise Exception, "no sequence string(s) specified: %s" % seqs
        # make a kind of semi-unique filename
        uniqueid = get_random_string_tag()
        inputfile = uniqueid + "_" + "_".join(
            [_nonstringheader2stringheader(hdr) for hdr in seqs.keys()[0:5]])
        inputfile += ".mfa"
        writeMultiFasta(seqs, inputfile)
    else:
        # no input at all
        raise "no input specified"

    # okay, do the clustalw
    fname_in = inputfile
    # get hard-assigned parameters
    paramstring = " ".join(["-%s=%s" % (k, v) for k, v in params.iteritems()])
    ci, co = osPopen2("%s %s %s" %
                      (EXECUTABLE_CLUSTALW, fname_in, paramstring))
    ci.close()
    clwout = co.read()
    co.close()
    # derive output filenames from the input filename
    if fname_in.find(".") == -1:
        fname_out = fname_in + ".aln"
        fname_tree = fname_in + ".dnd"
    else:
        _base = fname_in[0:fname_in.rfind(".")]
        fname_out = _base + ".aln"
        fname_tree = _base + ".dnd"

    # parse alignment output file
    _seqs, _alignment = _parse_clustalw(fname_out)
    # and delete tmp. created files
    osRemove(fname_out)
    osRemove(fname_tree)
    if remove_inputfile: osRemove(fname_in)
    # check if the keys (headers) in _seqs correspond to those in seqs
    # differences can occur when non-string headers are used

    # and return
    return (_seqs, _alignment)
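
A hypothetical in-memory call (Python 2, as in the example). The headers and residues are made up, and clustalw must be reachable via the configured EXECUTABLE_CLUSTALW:

# Hypothetical usage with a dict of sequences.
seqs = {"seqA": "MSTNPKPQRKTKRNTNRRPQDVK",
        "seqB": "MSTNPKPQRRTKRNSNRRPQDVK"}
aligned, alignment = clustalw(seqs=seqs)
for hdr, algseq in aligned.iteritems():
    print hdr, algseq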
Example 14
def _create_hmm_profile(cbg,
                        area="OMSR",
                        prevcbg=None,
                        nextcbg=None,
                        strip_nonaligned_residues=False,
                        verbose=False,
                        **kwargs):
    """
    """
    # area must be one of
    # OMSR MINSR MAXSR
    # LEFTSPRDIF RIGTHSPRDIF
    # OMSRANDLEFTSPRDIF OMSRANDRIGTHSPRDIF
    # RIGTHORFEND

    # update to default value
    if not kwargs.has_key('sprdif_min_aa_length'):
        kwargs['sprdif_min_aa_length'] = 20

    if area == "OMSR":
        if cbg.has_overall_minimal_spanning_range():
            coords = cbg.overall_minimal_spanning_range()
        else:
            return None, {}
    elif area == "MINSR":
        if cbg.has_minimal_spanning_range():
            coords = cbg.minimal_spanning_range()
        else:
            return None, {}
    elif area == "MAXSR":
        if cbg.has_maximal_spanning_range():
            coords = cbg.maximal_spanning_range()
        else:
            return None, {}
    elif area == "LEFTSPRDIF":
        if cbg.has_left_spanningrange_difference(**kwargs):
            coords = cbg.left_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "RIGTHSPRDIF":
        if cbg.has_rigth_spanningrange_difference(**kwargs):
            coords = cbg.rigth_spanningrange_difference(**kwargs)
        else:
            return None, {}
    elif area == "OMSRANDLEFTSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_left_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.left_spanningrange_difference(**kwargs)
        # remove short contributors to left SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(coordrange), max(omsr[node]) + 1))
    elif area == "OMSRANDRIGTHSPRDIF":
        kwargs['sprdif_min_aa_length'] = 20
        if not cbg.has_overall_minimal_spanning_range() or\
        not cbg.has_rigth_spanningrange_difference(**kwargs):
            return None, {}
        # if here, start preparing coords
        coords = cbg.rigth_spanningrange_difference(**kwargs)
        # remove short contributors to right SPRDIF
        coords = _remove_short_sprdif_contributors(coords, verbose=verbose)
        # increase coord range by OMSR area
        omsr = cbg.overall_minimal_spanning_range()
        for node, coordrange in coords.iteritems():
            coords[node] = Set(range(min(omsr[node]), max(coordrange) + 1))
    elif area == "RIGTHORFEND":
        # area in between MAXSR and orfend
        if not cbg.has_maximal_spanning_range(): return None, {}
        # get coords & obtain Orf ends
        coords = cbg.maximal_spanning_range()
        nodes = coords.keys()
        for node in nodes:
            organism = cbg.organism_by_node(node)
            theorf = cbg.get_orfs_of_graph(organism=organism)[0]
            coords[node] = range(max(coords[node]) + 1, theorf.protein_endPY)
            # remove zero-length ranges
            if len(coords[node]) == 0: del (coords[node])
    else:
        raise "WHAT ELSE!?"

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # decrease coord range by prevcbg if applicable
    if area in ["MAXSR", "LEFTSPRDIF", "OMSRANDLEFTSPRDIF"] and prevcbg:
        omsr = prevcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(prevcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodePrev = prevcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodePrev): continue
            sta = max([max(omsr[nodePrev]) + 1, min(coords[nodeCbg])])
            end = max(coords[nodeCbg]) + 1
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # decrease coord range by nextcbg if applicable
    if area in ["MAXSR", "RIGTHSPRDIF", "OMSRANDRIGTHSPRDIF"] and nextcbg:
        omsr = nextcbg.overall_minimal_spanning_range()
        for org in cbg.organism_set().intersection(nextcbg.organism_set()):
            # omsr/coords have Node keys -> translate to Organism keys
            nodeCbg = cbg.get_organism_nodes(org)[0]
            nodeNext = nextcbg.get_organism_nodes(org)[0]
            # check if node not deleted earlier in coords dict
            if not coords.has_key(nodeCbg): continue
            if not omsr.has_key(nodeNext): continue
            sta = min(coords[nodeCbg])
            end = min([min(omsr[nodeNext]), max(coords[nodeCbg]) + 1])
            coords[nodeCbg] = Set(range(sta, end))
            if not coords[nodeCbg]: del (coords[nodeCbg])

    # check if coords still present
    if not coords: return None, {}

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # do/redo _remove_short_sprdif_contributors if required
    if area in [
            "MAXSR", "LEFTSPRDIF", "RIGTHSPRDIF", "OMSRANDLEFTSPRDIF",
            "OMSRANDRIGTHSPRDIF", "RIGTHORFEND"
    ]:
        coords = _remove_short_sprdif_contributors(coords)

    ############################################################################
    if verbose:
        print area, sum([(max(v) - min(v))
                         for k, v in coords.iteritems()]), len(coords)
    ############################################################################

    # check if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # check sprdif_min_aa_length if applicable
    if area in [
            "RIGTHSPRDIF", "LEFTSPRDIF", "OMSRANDRIGTHSPRDIF",
            "OMSRANDLEFTSPRDIF"
    ]:
        maxlength = max([len(vlist) for vlist in coords.values()])
        if maxlength < kwargs['sprdif_min_aa_length']:
            return None, {}

    # if here, obtain sequences and build HMM search profile

    # get fasta sequences
    fastaseqs = cbg._get_sequences_by_coords(coords)

    # rewrite dict (node) keys to string keys
    fastaseqs, coords = _rename_dict_keys_to_strings(fastaseqs, coords)

    # remove empty sequence strings from fastaseqs dict
    empty_seq_keys = []
    for k, seq in fastaseqs.iteritems():
        if seq == "" or len(seq) == 1:
            empty_seq_keys.append(k)
    for k in empty_seq_keys:
        del (coords[k])
        del (fastaseqs[k])

    # check (again) if at least 2 sequences/nodes are remaining
    if len(coords) <= 1: return None, {}

    # rewrite coords to (min,max) tuple
    coords = dict([(key, [min(vlist), max(vlist) + 1])
                   for key, vlist in coords.iteritems()])

    # perform clustalw multiple alignment
    (alignedseqs, alignment) = clustalw(seqs=fastaseqs)

    # strip exterior gaps in case of OMSR/MINSR area
    if area in ["OMSR", "MINSR"]:
        alignedseqs, alignment, coords = strip_alignment_for_exterior_gaps(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))

    # strip poorly conserved residues in case of RIGTHORFEND
    if area in ["RIGTHORFEND"]:
        alignedseqs, alignment, coords = strip_poorly_supported_tails(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords), 0.20)

    # strip_overall_nonaligned_residues if requested for: THIS IS VERY RIGID!
    if strip_nonaligned_residues:
        alignedseqs, alignment, coords = strip_overall_nonaligned_residues(
            deepcopy(alignedseqs), deepcopy(alignment), deepcopy(coords))
        # check if alignment was completely consumed or not
        if not alignment or len(alignment) <= 1:
            return None, {}

    ############################################################################
    if verbose:
        print "## HMM clustalw input profile:", prevcbg != None, area, nextcbg != None
        for node, algseq in alignedseqs.iteritems():
            print algseq, node, coords[node]
        print alignment
    ############################################################################

    # make unique filename for hmm profile file
    fname_hmm_profile = "hmmbuild_profile_%s.hmmprof" % get_random_string_tag()

    # write multiple alignment input file
    writeMultiFasta(alignedseqs, fname_hmm_profile)

    # make hmmbuild file of the multiplealignment
    fname_hmmbuild_file = hmmbuild_protein(fname_hmm_profile)

    # remove hmm profile multiple alignment file
    osRemove(fname_hmm_profile)

    # return HMM search profile filename
    return fname_hmmbuild_file, coords
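
A hypothetical lifecycle for the returned profile file, paired with _file_cleanup from Example 6; the cbg object comes from the caller's context:

# Hypothetical usage; `cbg` is whatever CodingBlockGraph the caller holds.
fname_profile, coords = _create_hmm_profile(cbg, area="OMSR")
if fname_profile:
    # ... run hmmsearch with the profile here ...
    _file_cleanup([fname_profile])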
Example 15
def _create_hmm_db(organism,
                   inputdict,
                   cbg,
                   prev,
                   next,
                   orf_must_have_start=False,
                   max_intron_nt_length=200,
                   verbose=False):
    """
    Create a FASTA ORF database for an organism in a CBG and its vicinity

    @type  organism: * (presumably string)
    @param organism: Organism identifier recognizable in <input data structure>

    @type  inputdict: dict 
    @param inputdict: <input data structure> 

    @type  cbg: CodingBlockGraph or related object
    @param cbg: CodingBlockGraph upstream/5p of the cbg that must be completed

    @type  prev: CodingBlockGraph or related object (or None)
    @param prev: CodingBlockGraph upstream/5p of cbg that must be completed

    @type  next: CodingBlockGraph or related object (or None)
    @param next: CodingBlockGraph downstream/3p of cbg that must be completed

    @attention: `prev` and `next` CodingBlockGraphs reduce the search space of
                ORFs to scan with the HMM profile. This speeds up and
                improves the quality of results.

    @type  orf_must_have_start: Boolean
    @param orf_must_have_start: only allow ORFs with methionines as sbjct ORFs

    @type  max_intron_nt_length: integer
    @param max_intron_nt_length: positive maximum intron length to take
                                 into account when selecting suitable ORFs

    @type  verbose: Boolean
    @param verbose: report debugging-report on STDOUT (True) or be quiet (False)
    """

    # fullpath filename of result hmm multi fasta database
    fname_hmm_db_mfa = None
    if not cbg: return fname_hmm_db_mfa

    # (1) try to limit search space by prev and next CBG
    prevNode, nextNode = None, None
    prevMin, nextMax = None, None
    maskcoords = []

    # (1a) check if (informant) organism is in the prev CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if prev and organism in prev.organism_set() and\
    prev.has_overall_minimal_spanning_range():
        prevNode = prev.node_by_organism(organism)
        try:
            omsr = prev.overall_minimal_spanning_range(organism=organism)
            prevMin = (max(omsr) + 1) * 3
            maskcoords.append((0, max(omsr)))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1b) check if (informant) organism is in the next CBG AND if this CBG
    # has an OMSR -> not per se the case!
    if next and organism in next.organism_set() and\
    next.has_overall_minimal_spanning_range():
        nextNode = next.node_by_organism(organism)
        try:
            omsr = next.overall_minimal_spanning_range(organism=organism)
            nextMax = min(omsr) * 3
            aaseqlen = len(inputdict[organism]['genomeseq']) / 3
            maskcoords.append((min(omsr), aaseqlen))
        except KeyError:
            # hmmm.... block has an OMSR, but not for this organism!??!!?
            pass

    # (1c) limit search space if only prev or next was specified
    if not prev and next and nextMax:
        prevMin = nextMax - max_intron_nt_length
    if not next and prev and prevMin:
        nextMax = prevMin + max_intron_nt_length

    # (2a) get eligible sets of orfs from prev and next
    if not orf_must_have_start:
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax)
    else:
        # ORFs *must* have starts => searching for a TSS exon/CBG
        elegiable_orfs = inputdict[organism]['orfs'].get_eligible_orfs(
            min_orf_end=prevMin, max_orf_start=nextMax, has_starts=True)

    # (2b) check orf count; can be zero in case of a very tiny region to check
    if not elegiable_orfs: return fname_hmm_db_mfa

    # (3) write masked orfs to fasta database multi line string
    db_fasta = inputdict[organism]['orfs'].tomaskedfasta(
        coords=maskcoords, orflist=elegiable_orfs, header_prefix=organism)
    if orf_must_have_start:
        if len(db_fasta.strip()) == 0:
            # no UNmasked suitable ORFs remaining!
            # This is recognized later on in this function
            pass
        else:
            # mask out all AAs before the first start
            lines = db_fasta.split("\n")
            for linenr in range(0, len(lines)):
                line = lines[linenr]
                if line[0] != ">":
                    mpos = line.find("M")
                    if mpos > 0:
                        line = "X" * mpos + line[mpos:]
                    lines[linenr] = line
            # recreate db_fasta string
            db_fasta = "\n".join(lines)

    ############################################################################
    if verbose:
        if len(elegiable_orfs) > 10:
            orfidlist = len(elegiable_orfs)
        else:
            orfidlist = [orf.id for orf in elegiable_orfs]
        print "hmm-elegibable orfs:", organism, orfidlist, "/",
        print len(inputdict[organism]['orfs'].orfs), "prevMin:", prevMin,
        if prev:
            print prev.has_overall_minimal_spanning_range(),
        else:
            print None,
        print "nextMax:", nextMax,
        if next:
            print next.has_overall_minimal_spanning_range()
        else:
            print None
    ############################################################################

    # (4) make unique filename for hmm database file
    fname_base = get_random_string_tag()
    fname_hmm_db_mfa = "hmm_database_%s_%s.fa" % (fname_base, organism)

    # (5) write masked orfs to fasta database
    fh = open(fname_hmm_db_mfa, 'w')
    fh.write(db_fasta)
    fh.close()

    # (6) make sure that there were ORFs written to the file;
    # in case very few orfs are selected and all are masked -> no files!
    seqs_in_db = parseFasta(open(fname_hmm_db_mfa).readlines())
    if not seqs_in_db:
        # delete this (empty) file
        osRemove(fname_hmm_db_mfa)
        return None

    # (7) return hmm search database filename
    return fname_hmm_db_mfa
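
The database from this function is typically paired with the profile from Example 14. A hypothetical sketch of the combined flow; every object here comes from the caller's context:

# Hypothetical pairing of Examples 14 and 15.
fname_db = _create_hmm_db(org, inputdict, cbg, prevCBG, nextCBG)
fname_profile, coords = _create_hmm_profile(cbg, area="OMSR",
                                            prevcbg=prevCBG, nextcbg=nextCBG)
if fname_db and fname_profile:
    # ... run the project's hmmsearch wrapper here ...
    _file_cleanup([fname_db, fname_profile])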
Example 18
def _merge_pacbporfs_by_tinyexon_and_two_introns(pacbporfD,pacbporfA,
    orfSetObject,queryorsbjct,verbose = False, **kwargs):
    """
    Merge 2 PacbPORF objects by introns

    @attention: see pacb.connecting.merge_orfs_with_intron for **kwargs

    @type  pacbporfD: PacbPORF object
    @param pacbporfD: PacbPORF object that has to deliver PSSM donor objects

    @type  pacbporfA: PacbPORF object
    @param pacbporfA: PacbPORF object that has to deliver PSSM acceptor objects

    @type  orfSetObject: object with elegiable Orfs
    @param orfSetObject: object with elegiable Orfs

    @type  queryorsbjct: string
    @param queryorsbjct: literal string 'query' or 'sbjct'

    @type  verbose: Boolean
    @param verbose: print debugging info to STDOUT when True

    @rtype:  list
    @return: list with ( intron, ExonOnOrf, intron ) on the query sequence
    """
    # input validation
    IsPacbPORF(pacbporfD)
    IsPacbPORF(pacbporfA)

    # edit **kwargs dictionary for some forced attributes
    _update_kwargs(kwargs,KWARGS_PROJECTED_TINYEXON)

    MAX_TINYEXON_NT_LENGTH = 33
    MIN_TINYEXON_NT_LENGTH = 6

    tinyexons = []
    if queryorsbjct == "query":
        donorOrf = pacbporfD.orfQ
        accepOrf = pacbporfA.orfQ
        prjctOrf = pacbporfD.orfS
        alignedDonorRange = pacbporfD.alignment_dna_range_query()
        alignedAccepRange = pacbporfA.alignment_dna_range_query()
    elif queryorsbjct == "sbjct":
        donorOrf = pacbporfD.orfS
        accepOrf = pacbporfA.orfS
        prjctOrf = pacbporfD.orfQ
        alignedDonorRange = pacbporfD.alignment_dna_range_sbjct()
        alignedAccepRange = pacbporfA.alignment_dna_range_sbjct()
    else:
        message = "queryorsbjct '%s' is not 'query' or 'sbjct'" % queryorsbjct
        raise InproperlyAppliedArgument, message

    for dObj in donorOrf._donor_sites:
        # do not make a projection OVER the aligned area
        if dObj.pos < min(alignedDonorRange): continue
        if queryorsbjct == "query":
            (dPos,dPhase) = pacbporfD.dnaposition_query(dObj.pos,forced_return=True)
        else:
            (dPos,dPhase) = pacbporfD.dnaposition_sbjct(dObj.pos,forced_return=True)
        try:
            algDobj = pacbporfD._positions[dPos]
        except IndexError:
            # site out of range of PacbPORF -> break
            break
        for aObj in accepOrf._acceptor_sites:
            # do not make a projection OVER the aligned area
            if aObj.pos > max(alignedAccepRange): continue
            if queryorsbjct == "query":
                (aPos,aPhase) = pacbporfA.dnaposition_query(aObj.pos,forced_return=True)
            else:
                (aPos,aPhase) = pacbporfA.dnaposition_sbjct(aObj.pos,forced_return=True)
            try:
                algAobj = pacbporfA._positions[aPos]
            except IndexError:
                # site out of range of PacbPORF -> break
                break
            if queryorsbjct == "query":
                posDsbjct = algDobj.sbjct_dna_start + dPhase
                posAsbjct = algAobj.sbjct_dna_start + aPhase
            else:
                posDsbjct = algDobj.query_dna_start + dPhase
                posAsbjct = algAobj.query_dna_start + aPhase
            distance = posAsbjct - posDsbjct
            if distance >= MAX_TINYEXON_NT_LENGTH:
                break
            if distance < MIN_TINYEXON_NT_LENGTH:
                continue

            ####################################################
            # generate a ScanForMatches pattern file
            ####################################################
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            query = list(prjctOrf.inputgenomicsequence[posDsbjct:posAsbjct])
            # mask all non-phase0 nucleotides to N residues;
            # this represents the regularexpression for a specific
            # peptide sequence
            firstphasepositions = range( 3-dPhase % 3, len(query), 3)
            for pos in range(0,len(query)):
                if pos not in firstphasepositions:
                    query[pos] = "N"
            # calculate a ~50% mismatch number
            mismatches =  max([ 0, (len(query) - query.count("N"))/2 ])
            # write the pattern to string and subsequently to file
            # example pattern: 6...6 AG NNGNNANNANNGN[2,0,0] GT 3...3
            if kwargs['allow_non_canonical_donor']:
                sfmpat = "%s...%s AG %s[%s,0,0] G (T | C) %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)
            else:
                sfmpat = "%s...%s AG %s[%s,0,0] GT %s...%s" % (
                    AUSO,AUSO,"".join(query),mismatches,DDSO,DDSO)

            ####################################################
            if verbose:
                print (pacbporfD.orfQ.id,pacbporfA.orfQ.id),
                print distance, dObj, aObj
                print sfmpat
            ####################################################

            fname = "sfmpat_tinyexon_%s_%s_%s_%s" % (
                        donorOrf.id,
                        accepOrf.id,
                        posDsbjct,
                        posAsbjct,
                        )
            fh = open(fname,'w')
            fh.write(sfmpat+"\n")
            fh.close()

            ####################################################
            # run ScanForMatches
            ####################################################
            command = """echo ">myseq\n%s" | %s %s | tr "[,]" "\t\t#" | """ +\
                      """tr -d "\n " | sed "s/>/\\n>/g" | tr "#" "\t" | """ +\
                      """awk -F'\t' '{ if (NF==4 && $2>%s && $3<%s) """ +\
                      """{ print $1"["$2","$3"]\\n"$4 } }' """
            command = command % (
                        donorOrf.inputgenomicsequence,
                        EXECUTABLE_SFM,fname,
                        dObj.pos+(kwargs['min_intron_nt_length']-3),
                        aObj.pos-(kwargs['min_intron_nt_length']-3) )
            co = osPopen(command)
            matches = parseFasta(co.readlines())
            co.close()

            # filter matches for:
            # (1) correct donor & acceptor phase
            # (2) high enough donor & acceptor site scores
            for hdr,seqmatch in matches.iteritems():
                startQ,stopQ = [ int(item) for item in hdr.split(":")[1][1:-1].split(",") ]
                exonQstart   = startQ + AUSO + 2 - 1
                exonQstop    = stopQ  - DDSO - 2

                ####################################
                # get Orf object of tinyexon
                ####################################
                tinyexonorf = None
                # select the Orf on which the tinyexon is located
                for orfObj in orfSetObject.get_elegiable_orfs(
                max_orf_start=exonQstart,min_orf_end=exonQstop):
                    orfPhase = (exonQstart - orfObj.startPY) % 3
                    if orfPhase == dPhase:               
                        tinyexonorf = orfObj
                        break
                else:
                    # No tinyexonorf assigned!! In case a regex matched
                    # over a STOP-codon or the regex length is smaller
                    # than the smallest Orf, no Orf can be assigned
                    continue

                # filter for donor & acceptor score            
                dScore = _score_splice_site(seqmatch[-9:],splicetype='donor')
                aScore = _score_splice_site(seqmatch[0:11],splicetype='acceptor')
                if dScore < kwargs['min_donor_pssm_score']:
                    continue
                if aScore < kwargs['min_acceptor_pssm_score']:
                    continue

                # scan Orf for splicesites
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="donor",
                        min_pssm_score=kwargs['min_donor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_donor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_donor_pssm_score'])
                tinyexonorf.scan_orf_for_pssm_splice_sites(
                        splicetype="acceptor",
                        min_pssm_score=kwargs['min_acceptor_pssm_score'],
                        allow_non_canonical=kwargs['allow_non_canonical_acceptor'],
                        non_canonical_min_pssm_score=kwargs['non_canonical_min_acceptor_pssm_score'])

                # get 1st intron acceptor object
                intron1_aObj = None
                for a in tinyexonorf._acceptor_sites:
                    if a.pos == exonQstart:
                        intron1_aObj = a
                        break
                else:
                    # pseudo-acceptorsite as found by the SFM regex
                    # is not a valid acceptor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue

                # get 2nd intron donor object
                intron2_dObj = None
                for d in tinyexonorf._donor_sites:
                    if d.pos == exonQstop:
                        intron2_dObj = d
                        break
                else:
                    # pseudo-donorsite as found by the SFM regex
                    # is not a valid donor site of high enough score
                    # continue to next iteration of (hdr,seqmatch) pair
                    continue


                # check if introns are of elegiable lengths
                if (intron1_aObj.pos-dObj.pos) > kwargs['max_intron_nt_length']:
                    continue
                if (aObj.pos-intron2_dObj.pos) > kwargs['max_intron_nt_length']:
                    continue

                ####################################################
                if verbose:
                    # if here, a candidate!!!
                    print (pacbporfD.orfQ.id,tinyexonorf.id,pacbporfA.orfQ.id),
                    print hdr, dScore, aScore
                    print seqmatch
                ####################################################

                # append to found tinyexons
                query_data      = ( tinyexonorf, exonQstart, exonQstop )
                sbjct_data      = ( prjctOrf, posDsbjct, posAsbjct )
                splicesite_data = ( dObj,intron1_aObj, intron2_dObj, aObj )
                tinyexons.append( ( query_data, sbjct_data, splicesite_data ) )


            # file cleanup
            osRemove(fname)

    # return - End Of Function - if no tinyexons are found
    if not tinyexons:
        return []

    ####################################
    # select the **best** tinyexon
    ####################################
    (query_data,sbjct_data,splicesite_data) = tinyexons[0]
    orfQ,query_dna_start,query_dna_end = query_data
    orfS,sbjct_dna_start,sbjct_dna_end = sbjct_data
    (intron1_dObj,intron1_aObj,intron2_dObj,intron2_aObj) = splicesite_data

    ####################################################
    if verbose:
        print "tinyexon orf:", orfQ
        print "intron1 acceptor:", intron1_aObj
        print "intron2 donor:", intron2_dObj
    ####################################################

    ####################################
    # make tinyexon PacbPORF
    ####################################
    startQaa = orfQ.dnapos2aapos(query_dna_start) -1
    startSaa = orfS.dnapos2aapos(sbjct_dna_start) -1
    stopQaa  = orfQ.dnapos2aapos(query_dna_end) +1
    stopSaa  = orfS.dnapos2aapos(sbjct_dna_end) +1
    # shift the start inward, codon by codon, while a directly leading
    # stop codon remains (start lies before either Orf's protein start)
    while startQaa <= orfQ.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    while startSaa <= orfS.protein_startPY:
        startQaa+=1
        startSaa+=1
        query_dna_start+=3
        sbjct_dna_start+=3
    # shift the end inward, codon by codon, while a directly tailing
    # stop codon remains (end lies beyond either Orf's protein end)
    while stopQaa > orfQ.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    while stopSaa > orfS.protein_endPY:
        stopQaa-=1
        stopSaa-=1
        query_dna_end-=3
        sbjct_dna_end-=3
    # get sequences
    qAAseq = orfQ.getaas(abs_pos_start=startQaa,abs_pos_end=stopQaa)
    sAAseq = orfS.getaas(abs_pos_start=startSaa,abs_pos_end=stopSaa)

    ####################################################
    if verbose or len(qAAseq) != len(sAAseq):
        # if unequal lengths, error will be raised upon PacbP.__init__()
        print orfQ, qAAseq, startQaa, stopQaa, (stopQaa-startQaa),
        print (query_dna_start,query_dna_end)
        print orfS, sAAseq, startSaa, stopSaa, (stopSaa-startSaa),
        print (sbjct_dna_start,sbjct_dna_end)
        print orfQ.inputgenomicsequence[query_dna_start-2:query_dna_end+2]
        print orfS.inputgenomicsequence[sbjct_dna_start-2:sbjct_dna_end+2]
    ####################################################

    # initialize extended tinyexon PacbPORF
    from pacb import PacbP
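    # pipeline: wrap the two equal-length AA stretches in a PacbP, trim
    # unmatched ends, promote it to a PacbPORF in Orf coordinates, then
    # extend the alignment up to the flanking stop codons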
    pacbp = PacbP(input=( qAAseq, sAAseq, startQaa, startSaa ) )
    pacbp.strip_unmatched_ends()
    pacbporf = pacbp2pacbporf(pacbp,orfQ,orfS)
    pacbporf.extend_pacbporf_after_stops()
    pacbporf.source = 'ABGPprojectingTE'

    ####################################
    # make introns
    ####################################
    intron1 = IntronConnectingOrfs(
                intron1_dObj, intron1_aObj, None,
                donorOrf,pacbporf.orfQ )
    intron2 = IntronConnectingOrfs(
                intron2_dObj, intron2_aObj, None,
                pacbporf.orfQ, accepOrf )
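    # intron1 bridges the donor Orf and the tinyexon Orf; intron2 bridges
    # the tinyexon Orf and the acceptor Orf (both share pacbporf.orfQ)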


    ################################################################
    # set some meta-data properties to the intron objects
    ################################################################
    # add distance score to intron
    intron1._distance = 0
    intron2._distance = 0

    # add Alignment Positional Periphery Score into objects
    if queryorsbjct == "query":
        success = set_apps_intron_query(intron1,pacbporfD,pacbporf)
        success = set_apps_intron_query(intron2,pacbporf,pacbporfA)
    else:
        success = set_apps_intron_sbjct(intron1,pacbporfD,pacbporf)
        success = set_apps_intron_sbjct(intron2,pacbporf,pacbporfA)

    # set GFF fsource attribute for recognition of intron sources
    intron1._gff['fsource'] = "ABGPprojectingTE"
    intron2._gff['fsource'] = "ABGPprojectingTE"

    # create _linked_to_xxx attributes
    intron1._linked_to_pacbporfs = [ pacbporf ]
    intron2._linked_to_pacbporfs = [ pacbporf ]
    intron1._linked_to_introns   = [ intron2 ]
    intron2._linked_to_introns   = [ intron1 ]

    ####################################################
    if verbose:
        print pacbporf
        pacbporf.print_protein_and_dna()
        print intron1
        print intron2
        if False:
            # printing data when this function needs to be debugged:
            print ""
            print intron1
            print intron2
            print ""
            print pacbporfD
            pacbporfD.print_protein_and_dna()
            print ""
            print pacbporf
            pacbporf.print_protein_and_dna()
            print ""
            print pacbporfA
            pacbporfA.print_protein_and_dna()
            import sys
            sys.exit()
    ####################################################

    # return introns and intermediate tinyexon PacbPORF
    return [(intron1,intron2,pacbporf)]
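
# Hypothetical call sketch (not from the source; variable names are
# assumptions based on the signature and return value above). The function
# returns [] when no tinyexon bridges the two PacbPORFs, otherwise a list
# with a single (intron1, intron2, pacbporf) tuple:
#
#   result = _merge_pacbporfs_by_tinyexon_and_two_introns(
#           pacbporfD, pacbporfA, orfSetObject, "query", verbose=False)
#   if result:
#       intron1, intron2, tinyexon_pacbporf = result[0]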
    return polygonPolyData
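
# NOTE: the body of the FFD function that the `return polygonPolyData`
# above belongs to is missing from the source; only its final return
# survives. Below is a minimal, hedged sketch of a trivariate free-form
# deformation (Sederberg & Parry style) so the call further down has a
# concrete shape; the 2x2x2 lattice and the displaced corner are
# illustrative assumptions, not the original author's implementation.
def FFD(polyData, corner_offset=(0.0, 0.0, 30.0)):
    # deform the points inside their bounding box by displacing the
    # (1,1,1) corner of a degree-1 (2x2x2) control lattice; at local
    # box coordinates (s, t, u) the trilinear Bernstein weight of that
    # corner is s*t*u
    xmin, xmax, ymin, ymax, zmin, zmax = polyData.GetBounds()
    pts = polyData.GetPoints()
    for i in range(pts.GetNumberOfPoints()):
        x, y, z = pts.GetPoint(i)
        s = (x - xmin) / ((xmax - xmin) or 1.0)
        t = (y - ymin) / ((ymax - ymin) or 1.0)
        u = (z - zmin) / ((zmax - zmin) or 1.0)
        w = s * t * u
        pts.SetPoint(i,
                     x + w * corner_offset[0],
                     y + w * corner_offset[1],
                     z + w * corner_offset[2])
    polyData.Modified()
    return polyData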

# FFD (free-form deformation) transform
polygonPolyData = FFD(polygonPolyData)

Mapper = vtk.vtkPolyDataMapper()
Mapper.SetInputData(polygonPolyData)

Actor = vtk.vtkActor()
Actor.SetMapper(Mapper)

Ren1 = vtk.vtkRenderer()
Ren1.AddActor(Actor)


renWin = vtk.vtkRenderWindow()
renWin.AddRenderer(Ren1)

# save the scene as OBJ data (raw strings keep the Windows backslashes literal)
dir0 = r"C:\Users\hxu13\Desktop\pp\ExportData"
porter = vtk.vtkOBJExporter()
porter.SetFilePrefix(dir0 + r"\cells")
porter.SetRenderWindow(renWin)  # current vtkExporter API (SetInput is a legacy alias)
porter.Write()
# vtkOBJExporter writes cells.obj plus a cells.mtl material file;
# the .mtl companion is not needed here, so remove it
osRemove(dir0 + r'\cells.mtl')

iren = vtk.vtkRenderWindowInteractor()
iren.SetRenderWindow(renWin)
iren.Initialize()
iren.Start()
Esempio n. 21
0
    def tearDown(self):
        osRemove(TestGMLExporter.UNIT_TEST_FILENAME)
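
    # Hedged variant (a sketch, not from the source): osRemove raises
    # OSError when the file is absent, so a tolerant teardown could read:
    #
    #   def tearDown(self):
    #       try:
    #           osRemove(TestGMLExporter.UNIT_TEST_FILENAME)
    #       except OSError:
    #           pass  # nothing was written, nothing to clean up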
Esempio n. 22
0
    def _cleanupTempFile(self):
        osRemove(ToFastEdit.FAST_EDIT_TEMP_FILE)