Code example #1
 def __init__(self, prot_sequence, monoisotopic=False):
     if prot_sequence.islower():
         self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
     else:
         self.sequence = Seq(prot_sequence, IUPAC.protein)
     self.amino_acids_content = None
     self.amino_acids_percent = None
     self.length = len(self.sequence)
     self.monoisotopic = monoisotopic
Code example #2
File: AMOS.py Project: mickn/py_util
def run_minimus(fasta,
                outroot=None,
                restore_singletons=True,
                contig_prefix='',
                qual=None):
    '''given a fasta file and an optional output root (otherwise use fasta base)

    generates an assembly using minimus from the amos package
    assembly saved as <outroot>.minimus.fasta
    optionally restores singleton reads in <outroot>.all.fasta
    if restore_singletons=True, returns path to .all.fasta, otherwise returns path to minimus.fasta'''

    if outroot is None:
        outroot = fasta.rsplit('.', 1)[0]

    if qual:
        print >> sys.stderr, 'qualities invoked (%s)' % qual
        os.system('toAmos -s %s -q %s -o %s.afg' % (fasta, qual, outroot))
    else:
        os.system('toAmos -s %s -o %s.afg' % (fasta, outroot))
    os.system('minimus -D TGT=%s.afg %s.minimus' % (outroot, outroot))

    if contig_prefix:
        lines = open(outroot + '.minimus.contig').readlines()
        fh = open(outroot + '.minimus.contig', 'w')
        for l in lines:
            if l.startswith('##'):
                print >> fh, '##' + contig_prefix + l[2:],
            else:
                print >> fh, l,
        fh.close()

    if restore_singletons:
        in_assem = re.findall('#(.+?)\(',
                              open(outroot + '.minimus.contig').read())
        reads = Seq.Fasta(fasta)
        for f in in_assem:
            del reads[f]
        all_fasta = outroot + '.all.fasta'
        assem = Seq.Fasta(outroot + '.minimus.fasta')
        allseq = Seq.Fasta()
        allseq.update(dict([(contig_prefix + k, v) for k, v in assem.items()]))
        allseq.update(reads)
        allseq.write_to_file(all_fasta)
        return all_fasta
    else:
        if contig_prefix:
            f = outroot + '.minimus.fasta'
            lines = open(f).readlines()
            fh = open(f, 'w')
            for l in lines:
                if l.startswith('>'):
                    print >> fh, '>' + contig_prefix + l[1:],
                else:
                    print >> fh, l,
            fh.close()
        return outroot + '.minimus.fasta'
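
A minimal usage sketch for run_minimus, assuming this AMOS.py module is importable as AMOS and that the toAmos and minimus executables are on the PATH; the input file names are placeholders:

import AMOS

# Hypothetical call: assemble reads.fasta (with qualities) and get back the
# path to the combined contigs-plus-singletons fasta.
assembly = AMOS.run_minimus('reads.fasta',
                            outroot='my_assembly',
                            restore_singletons=True,
                            contig_prefix='ctg_',
                            qual='reads.qual')
print(assembly)  # my_assembly.all.fasta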
Code example #3
File: run_fasta.py Project: nhoffman/Seq
def trim_align(seqlist, align_data):
    """Assumes align_data is keyed by seq.getName(). Returns
    a new list of Seq objects, trimmed according to al_start
    and al_stop. Reverse-complements the sequence if necessary
    according to the orientation in the input alignment.
    """

    trimmed_seqs = []
    for seq in seqlist:
        name = seq.getName()

        if not align_data.has_key(name):
            log.info('the sequence %(name)s was not found in the alignment data' % locals())
            continue

        these_results = align_data[name]

        start = these_results['al_start']
        stop = these_results['al_stop']

        substr = seq[start - 1:stop]

        log.debug('name: %(name)s start: %(start)s stop: %(stop)s' % locals())

        if these_results['fa_frame'] == 'r':
            substr = Seq.reverse_complement(substr)
            log.debug('seq %s frame=%s, reverse complementing seq:\n%s' % (name, these_results['fa_frame'], substr))

        newseq = copy.deepcopy(seq)
        newseq.setSeq(substr)
        trimmed_seqs.append(newseq)

    return trimmed_seqs
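
For reference, a sketch of the align_data mapping that trim_align expects; the key and field names are inferred from the lookups above, and the values are purely illustrative:

# Hypothetical alignment data keyed by seq.getName(); fa_frame == 'r' makes
# trim_align reverse-complement the trimmed region.
align_data = {
    'read_0001': {'al_start': 12, 'al_stop': 340, 'fa_frame': 'f'},
    'read_0002': {'al_start': 1, 'al_stop': 275, 'fa_frame': 'r'},
}
trimmed = trim_align(seqlist, align_data)  # seqlist: a list of Seq objects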
Code example #4
File: sequtils_test.py Project: nhoffman/Seq
    def test_find_exec(self):
        cmd = "ls"
        path = Seq.find_exec(cmd)
        self.assertTrue(path)
        if path:
            out = subprocess.call([path], stdout=open(os.devnull, "w"))

        self.assertTrue(out == 0)
Code example #5
File: utils.py Project: manucorreia/biopython
def ungap(seq):
    """given a sequence with gap encoding, return the ungapped sequence"""
    #TODO - Fix this?  It currently assumes the outmost AlphabetEncoder
    #is for the gap.  Consider HasStopCodon(Gapped(Protein())) as a test case.
    gap = seq.gap_char
    letters = []
    for c in seq.data:
        if c != gap:
            letters.append(c)
    return Seq.Seq("".join(letters), seq.alphabet.alphabet)
Code example #6
def main():
    qseq = 'TATACTTT'
    sseq = Seq.complement(qseq)
    qseq = 'GGACTGACG'
    sseq = 'CCTGGCTGC'

    mono = 50
    diva = 1.5
    oligo = 50
    dntp = 0.25
    seq = Cal(qseq, sseq, mono_conc=mono, diva_conc=diva,
              oligo_conc=oligo, dntp_conc=dntp)
    print 'Tm: ', seq.Tm
    print 'DeltaG: ', seq.DeltaG
Code example #7
File: utils.py Project: manucorreia/biopython
def reduce_sequence(seq, reduction_table, new_alphabet=None):
    """ given an amino-acid sequence, return it in reduced alphabet form based
       on the letter-translation table passed. Some "standard" tables are in
       Alphabet.Reduced.
       seq: a Seq.Seq type sequence
       reduction_table: a dictionary whose keys are the "from" alphabet, and values
       are the "to" alphabet"""
    if new_alphabet is None:
        new_alphabet = Alphabet.single_letter_alphabet
        new_alphabet.letters = ''
        for letter in reduction_table:
            new_alphabet.letters += letter
        new_alphabet.size = len(new_alphabet.letters)
    new_seq = Seq.Seq('', new_alphabet)
    for letter in seq:
        new_seq += reduction_table[letter]
    return new_seq
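
A hedged usage sketch for reduce_sequence, assuming the old Bio.Seq / Bio.Alphabet API this fork builds on; the two-letter hydrophobic/polar table is made up for illustration:

# Illustrative only: collapse a short peptide into a hypothetical
# hydrophobic (H) / polar (P) alphabet.
hp_table = {'A': 'H', 'V': 'H', 'L': 'H', 'I': 'H', 'F': 'H',
            'S': 'P', 'T': 'P', 'N': 'P', 'Q': 'P', 'K': 'P'}
peptide = Seq.Seq('AVLSTNK', Alphabet.single_letter_alphabet)
reduced = reduce_sequence(peptide, hp_table)  # reduced sequence: HHHPPPP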
Code example #8
File: run_clustalw_test.py Project: nhoffman/Seq
import sys
import os
import unittest
import logging

import config

import Seq

log = logging

module_name = os.path.split(sys.argv[0])[1].rstrip('.py')
outputdir = config.outputdir
datadir = config.datadir

clustalw_path = Seq.find_exec('clustalw')

class TestClustalwInstalled(unittest.TestCase):
    def test1(self):
        if not clustalw_path:
            log.error('clustalw could not be found - skipping tests in this module')

if clustalw_path is not None:
    class TestRunClustalw(unittest.TestCase):

        def setUp(self):
            self.file1 = os.path.join(datadir, '10patients.fasta')
            self.funcname = '_'.join(self.id().split('.')[-2:])
            self.outfile = os.path.join(outputdir,self.funcname)

        def test1(self):
Code example #9
                    csv_writer = writer(write_obj)
                    csv_writer.writerow(rowy)
            return [frame, rowx]


def nothing(val):
    pass


# # #
cv2.namedWindow('image')
cv2.createTrackbar('threshold', 'image', 42, 255, nothing)
vs = VideoCapture.MyVideoCapture()
sp = seqpose.SEQP()
et = EyeTracker(vid=vs, seqp=sp)
s = Seq.SEQ()
while True:
    thresh_val = cv2.getTrackbarPos('threshold', 'image')
    et.pupil_thresh = thresh_val
    # print(pyautogui.position())
    save = win32api.GetAsyncKeyState(0x20)
    # start_time = time.time()
    f = et.mainloop(save)
    if len(f) != 0:
        print(np.shape(f[1]))
        p = s.predict(np.array([f[1]]))
        x = p[0][0] * 1919
        y = p[0][1] * 1079
        pyautogui.moveTo(x, y)
        print("model ", x, y)
        # print("real ",pyautogui.position().x,pyautogui.position().y)
Code example #10
import os
import subprocess


class RfamSearch():
    def __init__(self):
        pass

    def cmscan(self, seq):
        print seq
        # make tmp file
        f = open('/tmp/ss.fa', 'w')
        f.write('>test\n')
        f.write(seq.seq)
        f.close()

        old_pwd = os.getcwd()
        os.chdir('/home/magnus/work/rfamdb')
        cmd = 'cmscan -E 1 Rfam.cm /tmp/ss.fa > /tmp/cmscan.txt'
        # wait for cmscan to finish so /tmp/cmscan.txt is complete before reading it
        subprocess.call(cmd, shell=True)
        self.output = open('/tmp/cmscan.txt').read()
        os.chdir(old_pwd)
        return self.output


#main
if __name__ == '__main__':
    import Seq
    seq = Seq.Seq("GGCGCGGCACCGUCCGCGGAACAAACGG")
    rs = RfamSearch()
    rs.cmscan(seq)
Code example #11
File: run_hmmer_test.py Project: nhoffman/Seq
import sys
import os
import unittest
import logging

import config

import Seq

log = logging

module_name = os.path.split(sys.argv[0])[1].rstrip('.py')
outputdir = config.outputdir
datadir = config.datadir

hmmbuild_path = Seq.find_exec('hmmbuild')

class TestHmmerInstalled(unittest.TestCase):
    def test1(self):
        if hmmbuild_path is None:
            log.error('hmmer software could not be found - skipping tests in this module')

if hmmbuild_path is not None:
    class TestRunHmmer(unittest.TestCase):

        def setUp(self):
            self.file1 = os.path.join(datadir, 's_trimmed.aln')
            self.funcname = '_'.join(self.id().split('.')[-2:])
            self.outfile = os.path.join(outputdir,self.funcname)

        def test1(self):
Code example #12
File: run_fasta_test.py Project: nhoffman/Seq
import sys
import os
import unittest
import logging
import pprint

import config

import Seq

log = logging

module_name = os.path.split(sys.argv[0])[1].rstrip('.py')
outputdir = config.outputdir
datadir = config.datadir

fasta_path = Seq.find_exec('fasta35')

class TestFastaInstalled(unittest.TestCase):
    def test1(self):
        if fasta_path is None:
            log.error('fasta35 could not be found - skipping tests in this module')


if fasta_path is not None:
    class TestRunFasta(unittest.TestCase):

        def setUp(self):
            self.file1 = os.path.join(datadir, '10patients.fasta')
            self.funcname = '_'.join(self.id().split('.')[-2:])
            self.outfile = os.path.join(outputdir,self.funcname)
Code example #13
import sys

from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.Data import IUPACData
from Bio.SeqUtils import ProtParamData, IsoelectricPoint


class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is then converted to a
    sequence object using the Bio.Seq module. This is done just to make sure
    the sequence is a protein sequence and not anything else.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    """
    def __init__(self, prot_sequence, monoisotopic=False):
        if prot_sequence.islower():
            self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        else:
            self.sequence = Seq(prot_sequence, IUPAC.protein)
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number of times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            prot_dic = dict((k, 0) for k in IUPACData.protein_letters)
            for aa in prot_dic:
                prot_dic[aa] = self.sequence.count(aa)

            self.amino_acids_content = prot_dic

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the amino acid content in percentages.

        The same as count_amino_acids, only the counts are returned as a
        percentage of the entire sequence. Returns a dictionary of {AminoAcid:percentage}.

        The return value is cached in self.amino_acids_percent.

        input is the dictionary self.amino_acids_content.
        output is a dictionary with amino acids as keys.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()

            percentages = {}
            for aa in aa_counts:
                percentages[aa] = aa_counts[aa] / float(self.length)

            self.amino_acids_percent = percentages

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence"""
        # make local dictionary for speed
        if self.monoisotopic:
            water = 18.01
            iupac_weights = IUPACData.monoisotopic_protein_weights
        else:
            iupac_weights = IUPACData.protein_weights
            water = 18.02

        aa_weights = {}
        for i in iupac_weights:
            # remove a molecule of water from the amino acid weight
            aa_weights[i] = iupac_weights[i] - water

        total_weight = water  # add just one water molecule for the whole sequence
        for aa in self.sequence:
            total_weight += aa_weights[aa]

        return total_weight

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aromatic_aas = 'YWF'
        aa_percentages = self.get_amino_acids_percent()

        aromaticity = sum(aa_percentages[aa] for aa in aromatic_aas)

        return aromaticity

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
        """
        index = ProtParamData.DIWV
        score = 0.0

        for i in range(self.length - 1):
            this, next = self.sequence[i:i+2]
            dipeptide_value = index[this][next]
            score += dipeptide_value

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        scores = []

        for i in range(self.length - window_size):
            subsequence = self.sequence[i:i+window_size]
            score = 0.0

            for j in range(window_size // 2):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] + flexibilities[back]) * weights[j]

            # note: window_size // 2 + 1 is one position right of the window centre
            middle = subsequence[window_size // 2 + 1]
            score += flexibilities[middle]

            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle."""
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center. The weights are linear.
        it actually generates half a list. For a window of size 9 and edge 0.4
        you get a list of [0.4, 0.55, 0.7, 0.85].
        """
        unit = 2 * (1.0 - edge) / (window - 1)
        weights = [0.0] * (window // 2)

        for i in range(window // 2):
            weights[i] = edge + unit * i

        return weights

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length
        of the interval to use for the profile computation. For a window size n, we
        use the (n-1)/2 neighboring residues on each side to compute
        the score for residue i. The score for residue i is the sum of the scaled values
        for these amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window  have a
        larger weight than the others by setting the edge value for the  residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to
        view the change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
        #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
        #   in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i+window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                             (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids() #dictionary:  {AA:number present}

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns a list of the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet).
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in 'VIYFWL')
        turn  = sum(aa_percentages[r] for r in 'NPGS')
        sheet = sum(aa_percentages[r] for r in 'EMAL')

        return helix, turn, sheet
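
In released Biopython the class above is exposed as Bio.SeqUtils.ProtParam.ProteinAnalysis; a short usage sketch (the peptide string is arbitrary):

from Bio.SeqUtils.ProtParam import ProteinAnalysis

analysis = ProteinAnalysis("MAEGEITTFTALTEKFNLPPGNYKKPKLLYCSNGGHFLRILPDGTVDGT")
print(analysis.molecular_weight())              # average molecular weight
print(analysis.get_amino_acids_percent()['A'])  # fraction of alanine
print(analysis.isoelectric_point())
print(analysis.secondary_structure_fraction())  # (helix, turn, sheet)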
Code example #14
File: AMOS.py Project: mickn/py_util
def build_fasta_from_scaff_gff(infasta_s,
                               gff,
                               contig_prefix='',
                               include_singletons=True,
                               ol_minID=0.9,
                               outfile=None,
                               mum_len='4'):
    '''takes scaffolding information from gff of the form generated by get_scaff_from_minimus
    builds a single assembly for all scaffold instructions pertaining to seqids in infasta

    if include_singletons is True, adds all sequences from infasta not included in scaffolds along with the scaffolded sequence in the returned assembly

    ol_minID is the minimum %ID accepted for overlaps in contigs
    '''

    if isinstance(infasta_s, str):
        infasta = Seq.Fasta(infasta_s)
    else:
        infasta = deepcopy(infasta_s)

    suffixes = ['', 'b', 'c', 'd', 'e', 'f']
    current_suffix = ''

    #use only scaffolding info relevant to the specified infasta
    in_ids = infasta.seq_names()
    this_gff = [r for r in gff if r['seqid'] in in_ids]

    #use only scaffolding info that joins 2 or more seqs
    contigs = {}.fromkeys([r['attribute_contig'] for r in this_gff], 0)
    for r in this_gff:
        contigs[r['attribute_contig']] += 1

    #get final ordered scaffolding layout
    this_gff = sorted(
        [r for r in this_gff if contigs[r['attribute_contig']] > 1],
        key=lambda r: (r['attribute_contig'], int(r['attribute_cstart']),
                       int(r['attribute_cend'])))

    #extract sequences and orient for scaffolding
    assem_frags = infasta.substr_from_gff(this_gff,
                                          plus_strand=True,
                                          name_key=None)

    assem = Seq.Fasta()
    for k, v in contigs.items():
        if v > 1:
            assem[contig_prefix + k] = Seq.Sequence('')

    if this_gff:
        for i, r in enumerate(this_gff[:-1]):
            next = this_gff[i + 1]
            if r['attribute_contig'] == next['attribute_contig']:
                s1, e1, s2, e2 = [
                    int(n) for n in [
                        r['attribute_cstart'], r['attribute_cend'],
                        next['attribute_cstart'], next['attribute_cend']
                    ]
                ]
                print >> sys.stderr, 'scaffolding %s %s %s %s:\n\t%s\n\t%s' % (
                    s1, e1, s2, e2, r, next)

                if e1 > s2:
                    print >> sys.stderr, 'OVERLAP:\n\t%s\n\t%s' % (
                        assem_frags[r['seqid']][(s2 - s1):],
                        assem_frags[next['seqid']][:(e1 - s2 + 1)])
                    fa1 = Seq.Fasta()
                    fa2 = Seq.Fasta()
                    fa1['seq1'] = assem_frags[r['seqid']][(s2 - s1):]
                    fa2['seq2'] = assem_frags[next['seqid']][:(e1 - s2 + 1)]
                    shorter = min(len(fa1['seq1']), len(fa2['seq2']))
                    mums = Aln.mum(fa1,
                                   fa2,
                                   mumargs={'-l': '%s' % int(mum_len)})[0]
                    match = float(sum([mumr['score'] for mumr in mums]))
                    if (shorter <= 2*int(mum_len) + math.ceil((1-ol_minID)*shorter)) or \
                      (match/shorter >= ol_minID) or \
                      (fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or \
                      (Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])):
                        assem[contig_prefix + r['attribute_contig'] +
                              current_suffix] += assem_frags[r['seqid']][:s2]
                    else:
                        #implement record of splitting into a/b/etc fragments!
                        print >> sys.stdout, fa1, '\n', fa2, '\n', mums
                        current_suffix = suffixes[
                            suffixes.index(current_suffix) + 1]
                        print >> sys.stderr, 'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s' % (
                            e1 - s2, match /
                            (e1 - s2), ol_minID, current_suffix)
                        assem[contig_prefix + r['attribute_contig'] +
                              current_suffix] = assem_frags[r['seqid']]
                else:
                    spacer = Seq.Sequence('n' * (s2 - e1))
                    assem[contig_prefix + r['attribute_contig'] +
                          current_suffix] += assem_frags[r['seqid']] + spacer
            else:
                assem[contig_prefix + r['attribute_contig'] +
                      current_suffix] += assem_frags[r['seqid']]
                current_suffix = ''

        assem[contig_prefix + this_gff[-1]['attribute_contig'] +
              current_suffix] += assem_frags[this_gff[-1]['seqid']]

    if include_singletons:
        singletons = dict([(k, v) for k, v in infasta.items()
                           if not k in [r['seqid'] for r in this_gff]])
        assem.update(singletons)

    if outfile:
        assem.write_to_file(outfile)

    return assem
Code example #15
File: test_Seq.py Project: sunlei0227/IPyRSSA
#########
#########   Test Seq.py
#########

import sys
sys.path.append("/Share/home/zhangqf8/lipan/python_utils/PyPsBL")

import Seq

#####################
#  reverse_comp(sequence)
#####################

print Seq.reverse_comp("TAGCTAGCTGGTTAGTTCTATC")
print Seq.reverse_comp("TAGCTAatgcatTAGTTCTATC")
print Seq.reverse_comp("TAGCTAGCT---TAGTTC--TC")
print Seq.reverse_comp("TANNNNNNNNGTTAGTTCTATC")

#####################
#  flat_seq(sequence, lineLen=60)
#####################
import General
seqFn = "test_seq.fasta"
fasta = General.load_fasta(seqFn, rem_tVersion=False)

print Seq.flat_seq(fasta['ENST00000580210.5'])
print Seq.flat_seq(fasta['ENST00000580210.5'], lineLen=10)
print Seq.flat_seq(fasta['ENST00000580210.5'], lineLen=100)
print Seq.flat_seq("ACAGATTGTT")

#####################
#  format_gene_type(gene_type)
Code example #16
File: TestSeq.py Project: malikfassi/Algo
	def aTest(self,filename,expectedValue):
		S = Seq()
		S.chargerGrille(filename)
		S.trouvePuits()
		self.assertEqual(S.solutionExiste(),expectedValue)
Code example #17
File: AMOS.py Project: brantp/py_util
def build_fasta_from_scaff_gff(infasta_s,gff,contig_prefix='',include_singletons=True,ol_minID=0.9,outfile=None,mum_len='4'):
	'''takes scaffolding information from gff of the form generated by get_scaff_from_minimus
	builds a single assembly for all scaffold instructions pertaining to seqids in infasta
	
	if include_singletons is True, adds all sequences from infasta not included in scaffolds along with the scaffolded sequence in the returned assembly
	
	ol_minID is the minimum %ID accepted for overlaps in contigs
	'''
	
	if isinstance(infasta_s,str):
		infasta = Seq.Fasta(infasta_s)
	else:
		infasta = deepcopy(infasta_s)
	
	suffixes = ['','b','c','d','e','f']
	current_suffix = ''
	
	#use only scaffolding info relevant to the specified infasta
	in_ids = infasta.seq_names()
	this_gff = [r for r in gff if r['seqid'] in in_ids]
	
	#use only scaffolding info that joins 2 or more seqs
	contigs = {}.fromkeys([r['attribute_contig'] for r in this_gff],0)
	for r in this_gff:
		contigs[r['attribute_contig']] += 1

	#get final ordered scaffolding layout
	this_gff = sorted([r for r in this_gff if contigs[r['attribute_contig']] > 1],
					key=lambda r: (r['attribute_contig'],int(r['attribute_cstart']),int(r['attribute_cend'])))
	
	#extract sequences and orient for scaffolding
	assem_frags = infasta.substr_from_gff(this_gff,plus_strand=True,name_key=None)

	assem = Seq.Fasta()
	for k,v in contigs.items():
		if v>1:
			assem[contig_prefix+k] = Seq.Sequence('')

	if this_gff:
		for i,r in enumerate(this_gff[:-1]):
			next = this_gff[i+1]
			if r['attribute_contig'] == next['attribute_contig']:
				s1,e1,s2,e2 = [int(n) for n in [r['attribute_cstart'],r['attribute_cend'],next['attribute_cstart'],next['attribute_cend']]]
				print >> sys.stderr, 'scaffolding %s %s %s %s:\n\t%s\n\t%s' % (s1,e1,s2,e2,r,next)
			
				if e1 > s2:
					print >> sys.stderr, 'OVERLAP:\n\t%s\n\t%s' % (assem_frags[r['seqid']][(s2-s1):],assem_frags[next['seqid']][:(e1-s2+1)])
					fa1 = Seq.Fasta()
					fa2 = Seq.Fasta()
					fa1['seq1'] = assem_frags[r['seqid']][(s2-s1):]
					fa2['seq2'] = assem_frags[next['seqid']][:(e1-s2+1)]
					shorter = min(len(fa1['seq1']),len(fa2['seq2']))
					mums = Aln.mum(fa1,fa2,mumargs={'-l':'%s' % int(mum_len)})[0]
					match = float(sum([mumr['score'] for mumr in mums]))
					if (shorter <= 2*int(mum_len) + math.ceil((1-ol_minID)*shorter)) or \
							(match/shorter >= ol_minID) or \
							(fa1['seq1'][:shorter] == fa2['seq2'][:shorter]) or \
							(Seq.is_simple(fa1['seq1']) or Seq.is_simple(fa2['seq2'])):
						assem[contig_prefix+r['attribute_contig']+current_suffix] += assem_frags[r['seqid']][:s2]
					else:
						#implement record of splitting into a/b/etc fragments!
						print >> sys.stdout,fa1,'\n',fa2,'\n',mums
						current_suffix = suffixes[suffixes.index(current_suffix)+1]
						print >>sys.stderr, 'overlap of %s bp %0.2f %%id unresolved (min %0.2f)\nstarting %s' % (e1-s2,match/(e1-s2),ol_minID,current_suffix)
						assem[contig_prefix+r['attribute_contig']+current_suffix] = assem_frags[r['seqid']]
				else:
					spacer = Seq.Sequence('n'*(s2-e1))
					assem[contig_prefix+r['attribute_contig']+current_suffix] += assem_frags[r['seqid']] + spacer
			else:
				assem[contig_prefix+r['attribute_contig']+current_suffix] += assem_frags[r['seqid']]
				current_suffix = ''
	
		assem[contig_prefix+this_gff[-1]['attribute_contig']+current_suffix] += assem_frags[this_gff[-1]['seqid']]
	
	if include_singletons:
		singletons = dict([(k,v) for k,v in infasta.items() if not k in [r['seqid'] for r in this_gff]])
		assem.update(singletons)

	if outfile:
		assem.write_to_file(outfile)
		
	return assem
Code example #18
File: GL.py Project: Frankmusegit/MyPythonCode
#!/usr/bin/env python
#-*- coding:utf-8 -*-
from PyQt4.QtGui import *
from PyQt4.QtCore import *
from Seq import *

PDPI = 0.0
DPI = 0.0
DPMM = 0.0
symbols = dict()
icons = dict()
shortcuts = dict()
defaultTextStyles = []
docName = QString()
dataPath = QString()
mscoreGlobalShare = QString("share\\")
mscore = 0
gscore = 0
seq = Seq()
recentScores = QStringList()
revision = QString()
instrumentGroups = list()
articulation = list()
actions = dict()


Code example #19
File: run_fasta.py Project: nhoffman/Seq
def run(query, target, e_val=10, outfile=None,
    fastapath=None, format=10, cleanup=True):

    """Returns a dict keyed by (seqname1,seqname2) pairs containing
    alignment data.

    * query - Seq object or filename of fasta format sequences
    * target - list of Seq objects or filename of fasta format sequences
    * e_val - (see FASTA3* documentation)
    * outfile - name of output file
    * outdir - directory to write fasta output
    * fastapath - name of directory containing fasta3* executable
    * format - (see FASTA3* documentation)
    * cleanup - if True, delete fasta output file
    """

    # see http://helix.nih.gov/apps/bioinfo/fasta3x.txt
    # format = 10 for machine-readable alignments, 0 for traditional aligns

    if outfile:
        outdir = os.path.abspath(os.path.split(outfile)[0])
    else:
        outdir = TEMPDIR
        outfile = os.path.join(outdir, randomname(12)+ALIGN_SUFFIX)

    query_file, query_is_file = get_path_or_write_file(query, outdir)
    target_file, target_is_file = get_path_or_write_file(target, outdir)

    fasta_prog = Seq.find_exec('fasta35', fastapath)
    if fasta_prog is None:
        raise OSError('fasta35 could not be found')

    # -A Force Smith-Waterman alignment
    # -H Omit Histogram
    # -q Quiet - does not prompt for any input.
    # -m format
    # -z 0 estimates the significance of the match from the mean and standard deviation of the library scores, without correcting for library sequence length.
    # -d number of sequences to display
    # -E maximum expect value to display

    fastacmd = ' '.join("""
    %(fasta_prog)s
    -A
    -H
    -q
    -z 0
    -m %(format)s
    -O %(outfile)s
    %(query_file)s
    %(target_file)s""".split())

    cmd = fastacmd % locals()
    log.info( cmd )

    cmd_output = commands.getoutput(cmd)
    log.debug(cmd_output)

    # check for successful execution
    if not os.access(outfile,os.F_OK):
        log.critical('The following command failed:')
        log.critical(cmd)
        log.critical('...with output:')
        log.critical(cmd_output)
        raise Seq.ExecutionError(cmd_output)

    # parse the data
    data = parseFasta(open(outfile).read())

    if cleanup:
        if not query_is_file:
            os.remove(query_file)
        if not target_is_file:
            os.remove(target_file)
        os.remove(outfile)
        query_file = target_file = outfile = None

    for k in data.keys():
        data[k]['file_q'] = query_file
        data[k]['file_t'] = target_file
        data[k]['file_out'] = outfile

    return data
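
A hedged usage sketch for run, assuming this run_fasta module is importable and fasta35 is installed; the file names are placeholders, and the al_start/al_stop fields mirror the ones consumed by trim_align above:

import run_fasta

hits = run_fasta.run('queries.fasta', 'targets.fasta', e_val=1e-5,
                     outfile='hits.out', cleanup=False)
for (query_name, target_name), fields in hits.items():
    print('%s vs %s: %s-%s' % (query_name, target_name,
                               fields.get('al_start'), fields.get('al_stop')))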
Code example #20
def posterior(seq, emission_mat, transition_mat, k_counter, seeds, rec_num,
              counter):
    """
    calculates the most probable state for every base in seq
    :param seq: sequence
    :param emission_mat
    :param transition_mat
    :param k_counter: num of states
    :return: seq of states, aligned to original seq
    """
    N = len(seq)
    forward_table = forward(seq, emission_mat, transition_mat, k_counter)
    backward_table = backward(seq, emission_mat, transition_mat, k_counter)
    posterior_table = forward_table + backward_table
    # motif_order = EMPTY_STRING

    seq_obj = Seq(N - 2, rec_num)

    last_motif_0 = FIRST_MOTIF_STATE + len(seeds[0]) - 1
    first_motif_1 = last_motif_0 + 1
    last_motif_1 = first_motif_1 + len(seeds[1]) - 1
    first_motif_2 = last_motif_1 + 1
    last_motif_2 = first_motif_2 + len(seeds[2]) - 1

    # decide states
    for j in range(1, N - 1):
        curr_k = int(np.argmax(posterior_table[:, j]))

        if FIRST_MOTIF_STATE <= curr_k <= last_motif_0:
            # motif_order += MOTIF_0
            seq_obj.add_motif_base(0, (seq[j], curr_k - FIRST_MOTIF_STATE),
                                   j - 1)

        elif first_motif_1 <= curr_k <= last_motif_1:
            # motif_order += MOTIF_1
            seq_obj.add_motif_base(1, (seq[j], curr_k - first_motif_1), j - 1)

        elif first_motif_2 <= curr_k <= last_motif_2:
            # motif_order += MOTIF_2
            seq_obj.add_motif_base(2, (seq[j], curr_k - first_motif_2), j - 1)

        elif curr_k == 2:
            # motif_order += TELO_BACKGROUND
            seq_obj.add_telo_background(seq[j], j - 1)
        elif curr_k == 1:
            # motif_order += 'P'
            seq_obj.add_pre_telo((seq[j], curr_k))
        else:
            # motif_order += BACKGROUND
            seq_obj.add_normal_dna_base((seq[j], curr_k))

    # print_results(seq[1:-1], motif_order)
    seq_obj.print_statistics(doc=None, counter=counter)

    seq_obj.save_to_file()

    return
Code example #21
File: run_infernal_test.py Project: nhoffman/Seq
import sys
import os
import unittest
import logging
import pprint

import config

import Seq

log = logging

module_name = os.path.split(sys.argv[0])[1].rstrip('.py')
outputdir = config.outputdir
datadir = config.datadir

cmbuild_path = Seq.find_exec('cmbuild')

class TestInfernalInstalled(unittest.TestCase):
    def test1(self):
        if not cmbuild_path:
            log.error('Infernal software could not be found - skipping tests in this module')

class Test_Run(unittest.TestCase):

    def setUp(self):
        self.funcname = '_'.join(self.id().split('.')[-2:])
        self.outfile = os.path.join(outputdir,self.funcname)
        self.has_space = os.path.join(outputdir,'name with spaces')
        self.no_space = os.path.join(outputdir,'nameWithoutSpaces')

        os.system('echo `date` has space > "%s"' % self.has_space)