コード例 #1
0
ファイル: filter_reads.py プロジェクト: Charlie-George/cgat
def filter_low_complexity(s, o):
    """Copy reads from ``s`` to ``o``, dropping low-complexity and biased ones.

    Records are pulled with ``readread(s)`` until one whose first element is
    falsy comes back; kept records are emitted with ``writeread(record, o)``.
    The second element of each record is treated as the base sequence.

    A read is *low complexity* when at least two of A/C/T/G never occur in
    it, and *biased* when a single base makes up 90% or more of its length.
    A five-line summary of the counters is written to ``sys.stderr``.
    """
    biased = 0
    low_complexity = 0
    total_reads = 0
    remaining_reads = 0

    sread = readread(s)
    while sread[0]:
        total_reads += 1
        my_read = Seq(sread[1], generic_dna)
        counts = [my_read.count(base) for base in "ACTG"]
        if counts.count(0) >= 2:
            # An empty read lands here too (all four counts are zero), so
            # the division below never sees a zero length.
            low_complexity += 1
        elif max(counts) / float(len(my_read)) < 0.9:
            # float() forces true division. With Python 2 integer division
            # the ratio was always 0 here, so no read was ever "biased".
            writeread(sread, o)
            remaining_reads += 1
        else:
            biased += 1
        sread = readread(s)

    removed = biased + low_complexity
    # Plain "\n" escapes: the raw / double-escaped literals used previously
    # wrote a backslash followed by "n" instead of terminating the line.
    sys.stderr.write("Total reads processed: %s\n" % total_reads)
    sys.stderr.write("Low complexity reads removed: %s\n" % low_complexity)
    sys.stderr.write("Biased reads removed: %s\n" % biased)
    sys.stderr.write("Total reads removed: %s\n" % removed)
    sys.stderr.write("Total reads remaining: %s\n" % remaining_reads)
コード例 #2
0
ファイル: INI.py プロジェクト: SavinaRoja/challenges
def bio_seq_count_method(data):
    '''
    Wrap ``data`` in a BioPython Seq and return its (A, C, G, T) counts as a
    tuple. Building the Seq object is part of the work done here.
    '''
    dna = Seq(data)
    return tuple(dna.count(base) for base in 'ACGT')
コード例 #3
0
ファイル: ini.py プロジェクト: paulkarayan/rosalind
def main(dna_sequence):
    """Return the A, C, G and T counts of ``dna_sequence``, space separated."""
    sequence = Seq(dna_sequence)
    counts = [sequence.count(base) for base in "ACGT"]
    return " ".join(str(count) for count in counts)
コード例 #4
0
def simple2():
    """Print a short tour of Seq operations on a fixed 11-base sequence.

    In order: the alphabet, the slice [4:12], the reversed sequence, the
    number of "A" bases, the GC() value, the transcription and the
    translation.  Single-argument ``print(...)`` is used so the function
    runs under both Python 2 and Python 3.
    """
    my_seq = Seq("AGTACACTGGT", IUPAC.unambiguous_dna)
    print(my_seq.alphabet)
    # The sequence has 11 bases, so this slice simply stops at the end.
    print(my_seq[4:12])
    print(my_seq[::-1])
    print(my_seq.count("A"))
    print(GC(my_seq))
    print(my_seq.transcribe())
    print(my_seq.translate())
def main(fname):
    """Print the A, C, G and T counts of the DNA stored in file ``fname``.

    The whole file is read and surrounding whitespace is stripped before
    counting.  The four counts are printed on one line, space separated.
    """
    with open(fname) as f:
        dna = f.read().strip()

    seq = Seq(dna)

    # One print call with an explicit separator replaces the Python 2-only
    # "print x," statements and works on both major versions.
    print(" ".join(str(seq.count(item)) for item in "ACGT"))
コード例 #6
0
ファイル: seq_apps.py プロジェクト: UNOBIOI/bioi_milestone2
def composition(seq):
    """Return the (A, G, C, T) counts of ``seq``, ignoring letter case."""
    bases = Seq(seq.upper())
    return tuple(bases.count(letter) for letter in "AGCT")
コード例 #7
0
def file_check():
    """Validate the FASTA upload stored in ``user_seqInput.fa``.

    Only the first two lines of the file are read: the header line and the
    sequence line.  Three checks run, each appending its own message on
    failure:

    1. every character of the sequence line is one of G, C, A, T;
    2. the sequence line holds at least 30 characters;
    3. the header line starts with ``>``.

    Returns:
        tuple: ``(checker, response)`` where ``checker`` is True only when
        every check passed and ``response`` is the concatenated failure
        messages (empty when nothing failed).
    """
    # "with" guarantees the handle is closed even if a read fails.
    with open('user_seqInput.fa', 'r') as f:
        fline = f.readline()
        secline = f.readline()

    # Drop the line terminator up front so that neither the character check
    # nor the length check counts it as part of the sequence.
    seq = Seq(secline.rstrip('\r\n'))

    checker = True
    response = ''

    # Counting the valid base pairs in the sequence.
    valid_bases = sum(seq.count(base) for base in 'GCAT')
    if valid_bases != len(seq):
        checker = False
        response = response + 'Sequence contains improper characters\n'

    # The sequence must be at least 30 nucleotides long.
    if len(seq) < 30:
        checker = False
        response = response + 'Minimum of 30 characters required\n'

    # startswith() also copes with an empty file, where fline is ''.
    if not fline.startswith('>'):
        checker = False
        response = response + 'The first line is invalid\n'

    return checker, response
コード例 #8
0
ファイル: gen_align.py プロジェクト: smoitra87/pareto-hmm
def gen_filter(pos,fpath) :
	"""Yield one reduced SeqRecord per FASTA record found in ``fpath``.

	Each yielded record keeps only the columns listed in ``pos`` and reuses
	the id of the source record.  Records whose gap ('-') fraction over the
	kept columns is greater than GAP_LIM are left out.
	"""
	for record in SeqIO.parse(fpath,"fasta") :
		columns = [record.seq[p] for p in pos]
		seq = Seq("".join(columns),generic_protein)
		gap_fraction = float(seq.count('-'))/len(seq)
		if gap_fraction <= GAP_LIM :
			yield SeqRecord(seq,id=record.id)
コード例 #9
0
ファイル: INI.py プロジェクト: teju85/programming
def func(seq):
    """Print the A, C, G and T counts of ``seq`` on one line.

    The counts are separated by exactly one space.  The previous
    ``print c,`` form added its own space after a string that already ended
    in one, so the output had doubled spaces and a trailing blank, and the
    statement syntax was Python 2 only.
    """
    bs = Seq(seq)
    print(" ".join(str(bs.count(d)) for d in "ACGT"))
コード例 #10
0
def main(*args, **kwargs):
    """Print the A, C, G and T counts of a DNA file.

    The last positional argument is joined onto the current working
    directory and loaded with ``StrongHold().parserDNAFile``; the result is
    converted with ``str()`` before counting.  Keyword arguments are
    accepted but not used.
    """
    fpath = os.path.join(os.getcwd(), args[-1])
    dna = StrongHold().parserDNAFile(fpath)

    seq = Seq(str(dna))
    # Single print call: valid on Python 2 and 3, unlike "print cnt,".
    print(" ".join(str(seq.count(elem)) for elem in "ACGT"))
コード例 #11
0
def textbox_check(s):
    """Validate a sequence typed into the text box.

    Returns ``(checker, response)``: ``checker`` is False when ``s`` holds
    any character other than G, C, A, T or is shorter than 30 characters;
    ``response`` carries the matching message(s), or '' when both pass.
    """
    seq = Seq(s)
    recognised = sum(seq.count(base) for base in 'GCAT')

    problems = []
    if recognised != len(seq):
        problems.append('Characters are not valid\n')
    if len(seq) < 30:
        problems.append('Minimum of 30 characters required')

    return (not problems, ''.join(problems))
コード例 #12
0
ファイル: validation.py プロジェクト: UNOBIOI/bioi_milestone2
def contentCheck():
    """Report whether the sequence line of ``sequence.fasta`` is pure G/C/A/T.

    Reads the first two lines of the file, ignores the first, and prints a
    single verdict about the second.  The trailing newline of the sequence
    line is not treated as a sequence character.
    """
    # "with" closes the file even when a read raises.
    with open('sequence.fasta', 'r') as f:
        f.readline()  # header line, not inspected here
        secline = f.readline()

    # Stripping the newline replaces the two duplicated "len(seq) - 1" /
    # "len(seq)" branches with one comparison.
    seq = Seq(secline.rstrip('\n'))
    valid_bases = sum(seq.count(base) for base in 'GCAT')

    if valid_bases == len(seq):
        print('Sequence contains proper characters')
    else:
        # Same wording on every failure path (one branch used to print
        # "countains").
        print('Sequence contains improper characters')
コード例 #13
0
ファイル: gen_align.py プロジェクト: smoitra87/pareto-hmm
def gen_filter2(pos,fpath) :
	"""Yield one reduced SeqRecord per FASTA record found in ``fpath``.

	``pos`` lists the columns to keep: an entry equal to '-' contributes a
	literal gap, any other entry is converted with int() and used as an
	index into the record.  Records whose gap fraction is greater than
	GAP_LIM are left out; kept records reuse the source record's id.
	"""
	for record in SeqIO.parse(fpath,"fasta") :
		residues = ['-' if p == '-' else record.seq[int(p)] for p in pos]
		seq = Seq("".join(residues),generic_protein)
		gap_fraction = float(seq.count('-'))/len(seq)
		if gap_fraction <= GAP_LIM :
			yield SeqRecord(seq,id=record.id)
コード例 #14
0
ファイル: Kmer.py プロジェクト: CrescentLuo/Amphisbaena
def kmerFreq(isoform):
    """Build one tab-separated k-mer profile line for an isoform record.

    ``isoform`` is a whitespace-separated line.  The fields read are:
    0 chromosome, 1 and 2 start/end offsets into the chromosome sequence,
    3 isoform name, 5 strand, 9 exon count, 10 comma-separated exon lengths
    and 11 comma-separated exon starts relative to field 1.
    NOTE(review): this looks like a BED12 layout - confirm with the caller.

    The exons are spliced together (and reverse-complemented for strand
    '-'), then every k-mer in the module-level ``kmer_dict`` is counted and
    stored at the index ``kmer_dict`` assigns to it.  Unless ``args.count``
    is set, counts are scaled to occurrences per 1000 bases of spliced
    sequence.  ``args.overlap`` selects overlapping counting.

    Returns:
        str: the isoform name followed by the 4 ** args.repeat values,
        tab-separated and newline-terminated.
    """
    sline = isoform.rstrip().split()
    chrom = sline[0]
    iso = sline[3]
    strand = sline[5]
    exon_count = int(sline[9])
    exon_lengths = [int(length) for length in sline[10].rstrip(',').split(',')]
    exon_starts = [int(s) for s in sline[11].rstrip(',').split(',')]

    gene_seq = records[chrom].seq[int(sline[1]):int(sline[2])].upper()
    gene_seq_str = str(gene_seq)

    # join() instead of repeated "+": one allocation for the spliced string.
    spliced_seq = Seq("".join(
        gene_seq_str[exon_starts[i]:exon_starts[i] + exon_lengths[i]]
        for i in range(exon_count)))
    if strand == '-':
        # Only the spliced sequence is needed downstream; the full gene is
        # no longer reverse-complemented just to measure an unused length.
        spliced_seq = spliced_seq.reverse_complement()
    spliced_length = len(spliced_seq)

    kmer_freq = [0.0] * (4 ** args.repeat)
    count_kmer = spliced_seq.count_overlap if args.overlap else spliced_seq.count
    for kmer, slot in kmer_dict.items():
        kmer_freq[slot] = count_kmer(kmer) + 0.0

    if not args.count:
        kmer_freq = [cnt / spliced_length * 1000 for cnt in kmer_freq]

    return iso + '\t' + '\t'.join(str(freq) for freq in kmer_freq) + '\n'
コード例 #15
0
ファイル: miRProbeDesigner.py プロジェクト: mariogiov/lic
	def Expand_probe(self, probe_set):
		"""Merge probe keys that agree over their leading region.

		Every unordered pair of keys in ``probe_set`` is compared over its
		first ``end`` characters, and a consensus is built with 'N' at each
		mismatching position.  When the consensus has fewer than 8 'N's it
		becomes a key of the result, mapped to the de-duplicated union of
		both keys' values.  Otherwise both original keys are carried over
		with their own values, merged with anything already stored for them.

		``probe_set`` maps sequence strings to lists.  The returned dict is
		new, but a key that is only ever carried over shares its list with
		``probe_set``.  A ``probe_set`` with a single key has no pairs and
		therefore yields an empty dict.
		"""
		end = 17 ### Change this to get a longer or shorter region
		# probe_set is not modified below, so one snapshot of the keys is
		# enough for the whole pairwise scan.
		keylist = list(probe_set.keys())
		temp_dict = {}
		for position, startkey in enumerate(keylist):
			for rec in keylist[position + 1:]:
				temp_seq = ''.join(
					rec[base] if startkey[base] == rec[base] else 'N'
					for base in range(end))
				# str.count gives the same answer as wrapping the string in
				# Seq first, without needing Bio.Alphabet.
				if temp_seq.count('N') < 8: ### Change this to allow more or fewer N in the region
					merged = probe_set[startkey] + probe_set[rec]
					if temp_seq in temp_dict:
						merged = temp_dict[temp_seq] + merged
					temp_dict[temp_seq] = list(set(merged))
				else:
					for key in (startkey, rec):
						if key in temp_dict:
							temp_dict[key] = list(set(temp_dict[key] + probe_set[key]))
						else:
							temp_dict[key] = probe_set[key]
		return temp_dict
コード例 #16
0
ファイル: ProtParam.py プロジェクト: sgalpha01/biopython
class ProteinAnalysis:
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is then converted to a
    sequence object using the Bio.Seq module. This is done just to make sure
    the sequence is a protein sequence and not anything else.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    """

    def __init__(self, prot_sequence, monoisotopic=False):
        """Initialize the class.

        The sequence is always stored upper-cased.  Previously only fully
        lower-case input was converted, so mixed-case input kept lower-case
        letters that the upper-case lookup tables used below do not contain.
        """
        self.sequence = Seq(prot_sequence.upper())
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, return a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number}.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = {
                aa: self.sequence.count(aa)
                for aa in IUPACData.protein_letters
            }

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the amino acid content as fractions of the sequence.

        The same as count_amino_acids, except that each count is divided by
        the sequence length. Returns a dictionary of {AminoAcid:fraction}.
        For an empty sequence every fraction is 0.0.

        The return value is cached in self.amino_acids_percent.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()

            if self.length:
                total = float(self.length)
                percentages = {aa: n / total for aa, n in aa_counts.items()}
            else:
                # Nothing to divide by: report an all-zero composition
                # instead of raising ZeroDivisionError.
                percentages = {aa: 0.0 for aa in aa_counts}

            self.amino_acids_percent = percentages

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence."""
        return molecular_weight(self.sequence,
                                seq_type="protein",
                                monoisotopic=self.monoisotopic)

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aa_percentages = self.get_amino_acids_percent()

        return sum(aa_percentages[aa] for aa in "YWF")

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
        """
        index = ProtParamData.DIWV
        score = 0.0

        for i in range(self.length - 1):
            # "first"/"second" rather than "this"/"next": the latter hid
            # the next() builtin inside the loop.
            first, second = self.sequence[i:i + 2]
            score += index[first][second]

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for
        a window=9. The parameters used are optimized for determining the
        flexibility.

        Returns a list with one score per 9-residue window, i.e.
        ``length - 8`` values (empty when the sequence is shorter than 9).
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        # weights[j] applies to the residues j positions in from either
        # edge; the last entry is the weight of the central residue.
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        half = window_size // 2
        scores = []

        # "+ 1" so that the final window is scored as well, matching
        # protein_scale; without it a 9-residue sequence gave no score.
        for i in range(self.length - window_size + 1):
            subsequence = self.sequence[i:i + window_size]
            score = 0.0

            for j in range(half):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] +
                          flexibilities[back]) * weights[j]

            # The centre of a 9-residue window is index 4.  Index 5 was
            # used before, which counted a flanking residue twice and
            # never counted the centre.
            score += flexibilities[subsequence[half]] * weights[half]

            # 5.25 is the sum of all nine weights: 2 * 2.125 + 1.
            scores.append(score / 5.25)

        return scores

    def gravy(self, scale="KyteDoolitle"):
        """Calculate the GRAVY (Grand Average of Hydropathy) according to Kyte and Doolitle, 1982.

        Utilizes the given Hydrophobicity scale, by default uses the original
        proposed by Kyte and Doolittle (KyteDoolitle). Other options are:
        Aboderin, AbrahamLeo, Argos, BlackMould, BullBreese, Casari, Cid,
        Cowan3.4, Cowan7.5, Eisenberg, Engelman, Fasman, Fauchere, GoldSack,
        Guy, Jones, Juretic, Kidera, Miyazawa, Parker,Ponnuswamy, Rose,
        Roseman, Sweet, Tanford, Wilson and Zimmerman.

        New scales can be added in ProtParamData.

        Raises ValueError when ``scale`` is not a known scale name.
        """
        selected_scale = ProtParamData.gravy_scales.get(scale)

        if selected_scale is None:
            raise ValueError(f"scale: {scale} not known")

        total_gravy = sum(selected_scale[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Make list of relative weight of window edges (PRIVATE).

        The relative weight of window edges are compared to the window
        center. The weights are linear. It actually generates half a list.
        For a window of size 9 and edge 0.4 you get a list of
        [0.4, 0.55, 0.7, 0.85].

        A window narrower than 2 has no edge positions, so the list is
        empty.
        """
        half = window // 2
        if half == 0:
            # Also avoids dividing by (window - 1) == 0 for window=1.
            return []

        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each
        type of amino acid. The most frequently used scales are the
        hydrophobicity or hydrophilicity scales and the secondary structure
        conformational parameters scales, but many other scales exist which
        are based on different chemical and physical properties of the
        amino acids.  You can set several parameters that control the
        computation of a scale profile, such as the window size and the window
        edge relative weight value.

        WindowSize: The window size is the length of the interval to use for
        the profile computation. For a window size n, we use the i-(n-1)/2
        neighboring residues on each side to compute the score for residue i.
        The score for residue i is the sum of the scaled values for these
        amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the
        same weight, but you can make the residue at the center of the window
        have a larger weight than the others by setting the edge value for the
        residues at the beginning and end of the interval to a value between
        0 and 1. For instance, for Edge=0.4 and a window size of 5 the weights
        will be: 0.4, 0.7, 1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to view the
        change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Residues missing from ``param_dict`` contribute nothing to the score
        and produce a warning on stderr.

        Similar to expasy's ProtScale:
        http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be
        #   [0.4,0.7,1.0,0.7,0.4] what you actually get from _weights_list
        #   is [0.4,0.7]. The correct calculation is done in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i + window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception
                # on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    sys.stderr.write(
                        "warning: %s or %s is not a standard "
                        "amino acid.\n" %
                        (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write(
                    f"warning: {middle} is not a standard amino acid.\n")

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def charge_at_pH(self, pH):
        """Calculate the charge of a protein at given pH."""
        aa_content = self.count_amino_acids()
        charge = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return charge.charge_at_pH(pH)

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Sums the fractions of the amino acids which tend to be in Helix,
        Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet).
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in "VIYFWL")
        turn = sum(aa_percentages[r] for r in "NPGS")
        sheet = sum(aa_percentages[r] for r in "EMAL")

        return helix, turn, sheet

    def molar_extinction_coefficient(self):
        """Calculate the molar extinction coefficient.

        Calculates the molar extinction coefficient assuming cysteines
        (reduced) and cystines residues (Cys-Cys-bond).

        Returns a tuple (reduced, with_cystines).
        """
        num_aa = self.count_amino_acids()
        mec_reduced = num_aa["W"] * 5500 + num_aa["Y"] * 1490
        # Each cystine is a pair of cysteines, hence the floor division.
        mec_cystines = mec_reduced + (num_aa["C"] // 2) * 125
        return (mec_reduced, mec_cystines)
コード例 #17
0
# NOTE: my_seq is not defined in this fragment; it must already exist.
# Bare expressions below only show a value in an interactive session.

# get the sequence alphabet
my_seq.alphabet


# print the position and letter of every element, then the length of that
# letter
for index, letter in enumerate(my_seq):
    # %-formatting gives the same "index letter" output on Python 2 and 3.
    print("%s %s" % (index, letter))
    print(len(letter))

# access elements of the sequence
print(my_seq[0])  # first element
print(my_seq[2])  # third element
print(my_seq[-1])  # last element

# count
my_seq.count('G')

# GC percentage   (#G + #C) / #Total
from Bio.SeqUtils import GC
GC(my_seq)


# slicing
my_seq[4:12]   # starts at index 4, stops before index 12

# reversing:
my_seq[::-1]

# convert to string (str() replaces the removed Seq.tostring() method)
str(my_seq)
コード例 #18
0
ファイル: rosalind_ini.py プロジェクト: craigfay/Rosalind
def base_frequency(sequence):
    """Print the A, C, G and T counts of ``sequence``, space separated."""
    s = Seq(sequence)
    print(*(s.count(base) for base in 'ACGT'))
コード例 #19
0
ファイル: Counting.py プロジェクト: mikegloudemans/rosalind
from Bio.Seq import Seq

if __name__ == '__main__':
    # Read only the first line of data.txt; "with" closes the file, which
    # the bare open() call never did.
    with open("data.txt") as f:
        data = f.readline()
    mySeq = Seq(data)
    # Single-argument print(...) is valid on Python 2 and Python 3.
    print(" ".join(str(mySeq.count(base)) for base in "ACGT"))
コード例 #20
0
ファイル: ACGTcount.py プロジェクト: kimalaacer/Rosalind
from Bio.Seq import Seq
# Count A, C, G and T in a fixed example sequence.  The tuple is only
# displayed when this is run in an interactive session.
my_seq = Seq("AGTACACTGGT")
tuple(my_seq.count(base) for base in "ACGT")
コード例 #21
0
ファイル: dna.py プロジェクト: even4void/rosalind
# number of times that the symbols 'A', 'C', 'G', and 'T' occur in s.

from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

# The backslash continuations keep the indentation of every following line
# inside the literal; those spaces are removed before the Seq is built.
raw = "AGAAGTCCTATAGATTGGATCCAATTAGCGTAGAAGTCTGTAAGCGAT \
       ACGACCCCCTTTAGTCGAGTGTCGCCTGAGGGCTTATGTTCTTCGGCG \
       CCCGTAGTGGTGCCAGTTGATTGTCAGCTCCTGACATGGCTACTTCTA \
       ATTGTCTTTTATTACGGGGGGGCCGAGCCTCAGTGTGCCCCCCCCTGT \
       ATTAGCGACAGATTTACCTCTTCGTTAGAAGGAACAACCGAGAGAAGG \
       CCAGGGTGATTAATGCAAGCGGAGCTGACTTGCATCTTTCCTATTAAT \
       AGTATTGACTCCCGTTCCAAAGCGCCGATTATCGGGCACATGTTGATT \
       CATAGGCAACTATGAAATTGAAAGTTATAATGACTGGAAAGGCGGGCA \
       AGACGGCACTCAAGGCACGGACAACGAGCCCGCGAAATCAACTATTGT \
       AGCCGCGATAAACTTAAATTGGAGTAGCGTGGCGTCCGGAATCCAAGA \
       CGGCATACGGGGGTAACAAATGCTAGAAAGATGGAACGCCCCTGAAGT \
       CCCAAGCAAAGGGCCAAATTAACGGTTCCTGCACTACTGCCCGGGGCA \
       GGTCACCTCCTTCTCCATACTCCAATACATGCTGCAGGTAGGCCGATA \
       CTTTACGTGCCCAACGATGACGTTGATTAACCAGCCGTCTCGCAGGGA \
       TACGGCATCGTGAGAGGTGGCCTAGCTGCACAGGCCCCAGTTCATGCT \
       TAAGGTACTCGGTATACAGGAGTGCTAGGCTTATTACTAGCAACCGCC \
       AGTTAACTAACAATATAGTAGCCCCAAGACGGTATTCGCGCCCCTGGT \
       TGCCTTGAGTGTGTATCCGTTCTGCCATGGCACGCTCATAGGGCGATA \
       TCGATACATAACTCACGCACTTAAGGCCACGGGCACGACATGACATAA \
       CACTCTAACCTCTCTTCAATGGCCTATTTCGGCAGCTGGGAAGATGGT \
       GAGATCGGTTTTCGGTGAATATGCCATCA"

s = Seq("".join(raw.split(" ")), generic_dna)

# A, C, G, T counts; only displayed in an interactive session.
[s.count(x) for x in "ACGT"]
コード例 #22
0
import Bio.Alphabet
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq
# Show the complement and reverse complement of a fixed sequence, then its
# GC content as a percentage.
my_seq = Seq("AGTACACCTGGT")
print(my_seq.complement())
print(my_seq.reverse_complement())
my_seq.count("T")  # value is discarded outside an interactive session
GC_count = 100 * float(sum(my_seq.count(base) for base in "GC")) / len(my_seq)
print(GC_count)
コード例 #23
0
from Bio.Seq import Seq
#take in a sequence and count ACGT
my_seq = Seq(
    "CTGCTACTCTACTACTTCTTGAGTCGCAACGACTTCTGATAGGTCAATACAATACCTAGCCAACTTATGTCCTGGCGGTAAATGACGGTCAACGTAATGGTCGCCGATAACAGTACGAACAAGGATTATCTTTAAGCGAAGTTAGACCAAGAGCGTGGCACCACCTCTGACATGCATGATGGAGATTTGCACGCAGTTAACGTGAGAGCTTTCGGATGACATGACTGGAACGACAGATCTTACGTACCCTTAACAACACATTGCATAGTGTTCGAACCGTCCAACTATGTCCGTAGGACATTAGAGGGTGCTGGATCCCACTTACGCGCTAATTTCTCCGACTGGTTATCAGTATGATATATTCGCTTGGTCTCCTGCCCATGGGCGTTATACTACAGATTCACACCACGCTACGTTCCCGCGTTCGGTCGGATGGGCTCGCAGAATCTACAACGTCGCAGAAATATAAGGGGTCCCCCCCAATGAAATGTGGATTCCCTATGCAATGAGCGCCACTCCCAGCAATTGTCTGATTATTGCTGGAAGTCCTGGCGCCGATTAACAGCCTCTATAGTTTGACTTTTAGATGTAGGTGGTGCAATAACCTCACCCAGTCCTGCATCTGGGTTATAAATATCAAATTCGGAAGCATCTCCTCCCTTAAATGACGCACGTGGGGTTCGCGCACCCATATTGTATGCGTCAGGGCGAGCAAACCTATTCATACTTCTAGTCGGCGTGAATACAGCCGGTCGAGCTTCTTCTGGCGGTCGAAAAATGATATTAAGCCCCGAGCAACGCTGATTAAGCCGCCTACGTACTGGAACTCATGTTCCTAACTCCAGCCCCGAATTCGAGTATCATCCTTTCGTTTGGACC"
)
# One count per line, in the order A, G, T, C.
for base in "AGTC":
    print(my_seq.count(base))
コード例 #24
0
def get_nucleotide_counts(dna_sequence):
    """Count A, G, T and C in ``dna_sequence``.

    The counts are collected in a dict keyed by base and handed to
    ``sort_dict``; whatever that helper returns is returned unchanged.
    """
    my_seq = Seq(dna_sequence)
    return sort_dict({base: my_seq.count(base) for base in "AGTC"})
コード例 #25
0
class ProteinAnalysis:
    """Class containing methods for protein analysis.

    The class init method takes only one argument, the protein sequence as a
    string and builds a sequence object using the Bio.Seq module. This is done
    just to make sure the sequence is a protein sequence and not anything else.

    """

    def __init__(self, ProtSequence):
        """Store the upper-cased sequence and reset the cached results.

        The sequence is always upper-cased.  Previously only fully
        lower-case input was converted, so mixed-case input kept lower-case
        letters that the upper-case lookup tables used below do not contain.
        """
        self.sequence = Seq(ProtSequence.upper(), IUPAC.protein)
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Simply counts the number times an amino acid is repeated in the protein
        sequence. Returns a dictionary {AminoAcid:Number} and also stores the
        dictionary in self.amino_acids_content.
        """
        counts = {aa: self.sequence.count(aa)
                  for aa in IUPACData.protein_letters}
        self.amino_acids_content = counts
        return counts

    def get_amino_acids_percent(self):
        """Calculate the amino acid content as fractions of the sequence.

        The same as count_amino_acids, except that each count is divided by
        the sequence length. Returns a dictionary with amino acids as keys
        and stores it in self.amino_acids_percent.
        """
        if not self.amino_acids_content:
            self.count_amino_acids()

        percent_aa = {}
        for aa, count in self.amino_acids_content.items():
            # Absent residues skip the division, which also keeps an empty
            # sequence from dividing by zero.
            percent_aa[aa] = count / float(self.length) if count > 0 else 0
        self.amino_acids_percent = percent_aa
        return percent_aa

    def molecular_weight(self):
        """Calculate MW from Protein sequence"""
        # Residue weights: each amino acid minus one molecule of water.
        residue_weights = {aa: weight - 18.02
                           for aa, weight in IUPACData.protein_weights.items()}
        # add just one water molecule for the whole sequence.
        return 18.02 + sum(residue_weights[aa] for aa in self.sequence)

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        if not self.amino_acids_percent:
            self.get_amino_acids_percent()

        return sum(self.amino_acids_percent[aa] for aa in "YWF")

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).
        """
        # The table is only read, so no copy is needed.
        diwv = ProtParamData.DIWV
        score = 0.0
        for i in range(self.length - 1):
            score += diwv[self.sequence[i]][self.sequence[i + 1]]
        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.

        Returns a list with one score per 9-residue window, i.e.
        ``length - 8`` values (empty when the sequence is shorter than 9).
        """
        flex = ProtParamData.Flex
        window = 9
        # weights[j] applies to the residues j positions in from either
        # edge; the last entry is the weight of the central residue.
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        half = window // 2
        scores = []
        # "+ 1" so that the final window is scored as well, matching
        # protein_scale; without it a 9-residue sequence gave no score.
        for i in range(self.length - window + 1):
            sub_seq = self.sequence[i:i + window]
            score = 0.0
            for j in range(half):
                score += (flex[sub_seq[j]] + flex[sub_seq[window - j - 1]]) * weights[j]
            # The centre of a 9-residue window is index 4.  Index 5 was
            # used before, which counted a flanking residue twice and
            # never counted the centre.
            score += flex[sub_seq[half]] * weights[half]
            # 5.25 is the sum of all nine weights: 2 * 2.125 + 1.
            scores.append(score / 5.25)
        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle."""
        total = 0.0
        for aa in self.sequence:
            total += kd[aa]

        return total / self.length

    def _weight_list(self, window, edge):
        """Make the list of relative weights of the window edges.

        The weights are relative to the window center and are linear. Only
        half a list is generated. For a window of size 9 and edge 0.4 you
        get a list of [0.4, 0.55, 0.7, 0.85].

        A window narrower than 2 has no edge positions, so the list is
        empty.
        """
        half = window // 2
        if half == 0:
            # Also avoids dividing by (window - 1) == 0 for window=1.
            return []
        unit = ((1.0 - edge) / (window - 1)) * 2
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, ParamDict, Window, Edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length of the interval to use for the
        profile computation. For a window size n, we use the i-(n-1)/2 neighboring
        residues on each side of residue i to compute the score for residue i. The
        score for residue i is the sum of the scale values for these amino acids,
        optionally weighted according to their position in the window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window have a
        larger weight than the others by setting the edge value for the residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to view the
        change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Residues missing from ParamDict contribute nothing to the score and
        produce a warning on stderr.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be
        #   [0.4,0.7,1.0,0.7,0.4] what you actually get from _weight_list
        #   is [0.4,0.7]. The correct calculation is done in the loop.
        weight = self._weight_list(Window, Edge)
        scores = []
        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weight, 0.0) * 2 + 1

        for i in range(self.length - Window + 1):
            subsequence = self.sequence[i:i + Window]
            score = 0.0
            for j in range(Window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception
                # on a non-standard amino acid
                try:
                    score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window - j - 1]]
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                                     (subsequence[j], subsequence[Window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[Window // 2]
            if middle in ParamDict:
                score += ParamDict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)
        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        This method uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        if not self.amino_acids_content:
            self.count_amino_acids()
        calculator = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content)
        return calculator.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Sums the fractions of the amino acids which tend to be in Helix,
        Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three numbers (Helix, Turn, Sheet), each a
        fraction of the sequence length.
        """
        if not self.amino_acids_percent:
            self.get_amino_acids_percent()
        percent = self.amino_acids_percent
        helix = sum(percent[aa] for aa in "VIYFWL")
        turn = sum(percent[aa] for aa in "NPGS")
        sheet = sum(percent[aa] for aa in "EMAL")
        return helix, turn, sheet
コード例 #26
0
def count_GC():
    """Return the GC content of the joined contigs as a percentage string.

    The pieces in the module-level ``all_contig`` are concatenated and the
    upper-case "G" and "C" characters are counted directly on the resulting
    string (no Seq wrapper or alphabet is needed for a plain count).

    Returns:
        The percentage formatted with two decimals and a trailing percent
        sign, e.g. ``'50.00%'``.  An empty sequence yields ``'0.00%'``
        instead of raising ZeroDivisionError.
    """
    long_seq = ''.join(all_contig)
    if not long_seq:
        return '0.00%'
    gc_total = long_seq.count("G") + long_seq.count("C")
    # float() forces true division, so Python 2 does not truncate to 0.
    GC_content = 100 * (float(gc_total) / len(long_seq))
    return '%.2f%%' % GC_content
コード例 #27
0
# FASTA file
# The path is handed straight to SeqIO.read (as the GenBank read further down
# does) so that no file handle opened here is left unclosed.
phbfa = SeqIO.read(
    "C:/Users/Zé Freitas/Desktop/Mestrado/Labs_Bioinf/Trabalho prático/scripts/Labs_Bioinf/PHB/sequence.fasta",
    "fasta")
phbfa

my_seq = Seq(
    "GCAGTATGTGTGGTTGGGGAATTCATGTGGAGGTCAGAGTGGAAGCAGGTGAGAATGGAGGGGGCGGCAAAGGCTCGTTTCTGGGCATCTCTGCAGTCCTCCTCTGCTCCATGATGTGCACTTTGGGCGAGGAGAGTGCGTGCGTGAGTCCGACTTGTGAGGGAGGGGAGAAGGGGCTGAGCCCGGGACGAGCCAGGGGTTGCTCAGAGTAAGGGAGGTGTCCATGGAGGCAGGGTGAGGAATAATCCAGAAGCTATTACAAATGTAAAGGGCCGGGTGTCCCAGCCTCAGAGAAGGAAGATTTAAATGCACTGGACGAGATCAGGGTAGTCTCAGGAGTTGAGGTCTGGGAAGTAGGGAGGGAGGATTTGAGACTGGAGCGGGCAACGACGGTGGGGCGGAGCGTTAGAAAGTTACATGCTGGCGTGATTTCTAGTTAGGTCAACTGTGCTTATGCCCACCCCGCCTCAGCCCCACCCTCCCAGTTATTCCAGAGCTCACTGTCCCTGTGCAGCTAGTTAGAGCCTTTCTCCCAAATGGGTTCTTCAGTTATCTTGGCCCCAGGATGTCATCCAGCTCCTGCTTCCATAAGAAGCATGTCGTTCTTAATACACGATGTTGACAAGCAGTATGGTGAGGAGGTAAGCTGTGTCTGCTAGCATTAGACCTCTGGGTTCTAATTCTGGTTCTACCACTTAATAACTGCAATCTCGGCTTCTCATGTAACCTCTCTGTGTGCCTCTGTTTCCTCTGTAGTAATATGCTTCATAGGGTAATTGTGAGAAGTAAATAAATTGCTTTTATTAGGCTACCTGATATAAGTGTTAGCTGTTACGGTTACTTTTTTTGTTGGCATCAACATGTAGCACATTTTTTAAGTTATTTTTTTCAAACCATAATTGCACCAATCTAACCTCACAGCCTCTTTTTGGGGGCCTACTTGTCCAGGAAATGAGAGGGTGGTTTAGTGTGGTGCTAAGTTCTCTGTGGATTTCAAGCCCATGCATTGTTTTCATTATTGAACCAAGTGTCCCAGACACCTTACTTTAAATGGTTGAGAAAAAAAGAGAAATCAGCCAGGCATGATGGCTCATGCCTGTAATCCCAACACTTGGGAGGCCGAGGCAGGGGGATCACTTGAGCCCAGGAGTTTGAGACCACCTGGGGCAACGTAGCAAGACCCCATCTCTGCAAAAAATGAACAAAATTAGCCGGGCATGGTGGCACACTTCTGTGGTCCCAGCTACTTGGGAGGTTGAGGTGAGAAGATCGCTTGAGCCTGGGAGGTCGAGGCTTCAGTGAGCTGAGATTGCACCACTGCACTCCAGCCTGGGTGACAGAGCAAGACCCTGTCTCCAAAAAAAAAAAAAGGAAAGAAAAGAAACTGAAAAAAAAAAAAAGCAGAAGAATTGATAGTACACTTTCCAAGCTATAAAGCATTATTTATTAGGTATCCTTCAATGGATGATTTAGCACTTTCAGGAATGGGGAAATAAATAGCCAGGTTGAAAAGTGACTGTTGTGTGTCAGAGAGGGCCTTCTCTGAGGATTTGGCATCAAGTTTGATTGTATTTTGTTTTTATCCCCTTAGGTGTGAGAGGGTCCAGCAGAAGGAAACATGGCTGCCAAAGTGTTTGAGTCCATTGGCAAGTTTGGCCTGGCCTTAGCTGTTGCAGGAGGCGTGGTGAACTCTGCCTTATATAATGGTGAGGCATGGAGGGACAGTGGGTCACTGCACTTTCCTAGGAGTTTTCTGTTGGTCTGCATAGCCCATGTGACACTCTTGATGGTAGCTGCCGTCAGTGAATGTGTTTGTGGCCAAGAGGGCTCACCTCCTGCCATTTCATACCACAGGACTGCATTGTTATCAGAGCCCCTGACCTTTCAGTCATAGGTTCTCTCAGAGCCTGTATTCAAAAAGAGCTTCCCAGCCCACTTCCTAGTTGGATGTGTCCAGTGGCTTCTGTCAAGGTGAAGTGAAGCCGCACCACCCAA
ATGCTGCCGCACAGTGTCTGGATTTCCCTGGCTATCTGAAATGGAGATCTCATTTGTTCTCCTCTGCTTGCATGTGGAATAACAGCAAAGGCTGCAGATCTGTTTGGGTGACCTTGTCCTGAACAGGAACTTTTGCTGTGCTGAATTCGGGTAGTTTCAGAGAAAGTATCTTTGAGATGCATTGCCCAGCTTTTAACAGTGTAGGAGGGAGGTTAAGCTGGCTTTTCTTCCACTTTACTGTGGAAGCTTCCTCATTGGTCAAGCAATGGATTTGACCTGACTTTATCTGTAGGACCTCCTTTAATTCTGACATTCTGACACTTTCACATGCTGCAAAGCAGCAATAGATTGACCCATCCGGTGTGTGGCTGGCTGACAAGAGGAGCTTTACTTTCAGAGTGAAGATATTTGGACCAATGATAAAGTTCAGAGAGGCAGCTGATTAGAAAAGCCTGCTTGGCTTATATGACACATCTTAGCAGTACTGTGATCCTTTTGGCCACATCTGCAACTAGACAGAAATTGCCATCATAAATTTCTCTCTGTGCCAAGACAGCTCTATAACCCCTTAAAACTTTAGCGAAACAGAGCTATTAGGAAGAAAGAGTAGGCTCTTCGAAATGTAGGATTCCCATAATGAGGGTGCTACTTCTGGGAGCACTAGGTTAAATTGGAGCCCGATGGATATGTGGTAACTGGGAAGACCTCACTGAGTTTTAGAAGTTTTGGTAGATGATTCCTGGAAATATGTTGGTGGGAGTTCAGGGATAGAATGGTCATTCAGAAAATCAACAGCCAGTTCCCTCAAGGAGAAAGGATGCTAAGGAACAGGTCCTATTACCAATCCTTGGGGACATGTGGAACAGGAAGTGACTGCTTAGTTTTGCAGCTAGTTAGAAGTCTCTAGAGACCAGGAGTTGGGGAAGACAGAGAGAAGAGGGGAGACTTAATAAGTGAACAGAAAGCACCAGGGCTCTTTCAAAGACATGATCCTTTTGTTTAAAGGATGAGAGGATTTTTATGACATGTCATTGTCCTTTCTTCCTAGTGGATGCTGGGCACAGAGCTGTCATCTTTGACCGATTCCGTGGAGTGCAGGACATTGTGGTAGGGGAAGGGACTCATTTTCTCATCCCGTGGGTACAGAAACCAATTATCTTTGACTGCCGTTCTCGACCACGTAATGTGCCAGTCATCACTGGTAGCAAAGGTGAGTCTTGCCTATGGTTCAGGTAAAGTAGGGAGTGTGGAAGAGGTGCTCTGTTCTTCTGTGTCACAGGAGCATCTGTGGGATACCAGGATCCAAAAGAGTTTGAACTGTACATCATAGGAATGACTAGACTACTTGCCCTGGAGAGCTTGATATGGAATCTTAGAAATACCCACTTATGGCTGGGTGCGGTGGTTCATGCCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGTGGATCACCTGAGGTCAGGAGTTCAAGACCAGCCTGGCCAATGTGGTGAAACCCCATCTCTCCTAAAAATACAAAAATTAGCCGGTGGTGGGGGGTGCCTGTAATCCCAGCTACTTGAGAGGCTGAGGCAGGAGAATTGCTTGAGCCTGGGGGGCGGAGGTTGCAGTGAGCTAAGATTGTGCCACTTCACTGCAGCCTGGGCAACAGAGTGAGACTCTGTCTCAAAAAAAAAAAAAAAAAAAGCCTGCTTCTAATCTTCCCATCTCTTTGGAATTTCTTTCCGTACTGTTTTGCAGTTGTTTTCAGGATACATTATGTACCTATTTCTAAAACTATTGATAGGAGCTTCCAGAGATCAGGGAGTTGTAGGTATTAATACATTGCCCACCTCTCTTGGTGCCCAGTTCAGGGCTGTCTCATGGGCGCTTGGTCCATATTGTTGACATCTGTAAGCAAGCCGTGACAGTGCTTTGGCTCCAGGCAGGCCTGAATTGTCCAGGGGAAAGTATAATTCTCTCCCTGGATCCTTTAAATGGTCCAAGTAATGAGAAGCA
GAACATAGGATCAGTCTGTTAACCCCTTATATGTGTTACACATTTGACAGAGTGCTTTTACGTCTGTTTTCTCCTTCAATTTTCCCCAACATTTCCGCAAGGCCCAGAAAGCAAATGAAATTGTCCCCATTCTCATAGACAGGGAAATAAGCTCAGGTTGGCTAAGGCTTAGAGAGGCCACATCATTAGTAAATAGCCCAGATCTTTGGACTGATAGTCTAACACCGTTTCCACCAGACCCGAACTAACCTCTCCAAGGCTGACTCCTGACTTGGCCACAATCACCAGAGCATGTAAAGGCCTCACCCTACAATTCTTAGCATTGCCCTGTCTATTGTCTTAAAATGTTCAGTGTTGCAAACTTTGCATGGCACCTGTTAGACATATAATCTGAATTATGTATATCTGAGGGCATTCAGGGGATACCAAAAAGCTGCTATCACTGAAGCCTCTTAAGAAATTATAAACTCTTTATGATGCTCTATTGGGTTCTCTGCCAAGGAAACCAGGCATACCTGCACCTTGCCCTCTGGGATCTTATAATCAGCAGATTTGCTTATAAATTGTAGCAAATTTGGAGCCAGGCACAGTGGTGCGTGTCAGCTACTCAAGAGGCTGAGGCAGGAGAAATGCTTAAGCTCAGGAGCTTGAGTCTAGCCTGGGCCACATAGCAAGACCTTGTCTCTAAAAATAAAAAATAAAAATTGCCAGGCGTAATGGCTCACACCTGTAATTCTAGCATTTTGGGAGGCTGAGGCAGTTGGATCACTTGAGCCCAGGAGTTTGAGACCAGCCTGGGAAATATGGCGAAACCCCAGCTCTACAAAAAGTACAGAGATTAGCTGGGCGTGGTGGTCTGTGCCTGTGTAGTCCCAGCTACTTGGAAGGTGTAGGTGGGAGGATCAACTGAGCCCAGGAGGTCAAAGCGGCTACAGTGAGCTGTGATCTTACCACTGCACTTCAGCCTGGGCAACATGTGACCCTGTCTCAAAATACATAAATAAAAATTGTAGCAAATTGGAGTAGGAGAGGTCATATAAAAGACCACTTGTGGCCAGGTGCGGTGGCTCACACCTGTAATCCCAGCACTTTGGGAGGCTGAGGCAGGTAGATCACCTGAGGTCAGGAGTTTGAGACCAGCCTAACATGGTGACACCCTGTCTCTACTAAAAATACAAAACAGCTGGGTGTGGCGGCGCGTGCCTGTAATCCCAGCTACTCAGGAGGCTGAGGCAGGAGAATTGCTTGAATCTGGGAGGCAGAGGTTGTAGTGAGCCGAGATTGTCCCATTGTACTCCAGCCTAGGCAACAAGAGCAAAAACCTGTCTCAAAAAAAAAAAAAAACAAAAAAAAAACACTTGTTTTCCTACAGTGGTTTTTATTTTTAACTCCAGTGTTTGTCCCCTACCCTAAGATTTACAGAATGTCAACATCACACTGCGCATCCTCTTCCGGCCTGTCGCCAGCCAGCTTCCTCGCATCTTCACCAGCATCGGAGAGGACTATGATGAGCGTGTGCTGCCGTCCATCACAACTGAGATCCTCAAGTCAGTGGTGGTGAGTGAACAGGGGCCTTTAGCCTCGAGCCCAGAGCACCACCCTGGGAGGGTGCCAGGTGGCAGGAAGCGCTTGGCAGTGGGTTGGTTGGGATGTGGCTGCTAGTTTCCTGGTTCCTTTTCTGCTTCCTCATTAACCTGACCTGCCCTTCTGCTCCTCCCTTTGAAACCAGGCTCGCTTTGATGCTGGAGAACTAATCACCCAGAGAGAGCTGGTCTCCAGGCAGGTGAGCGACGACCTTACAGAGCGAGCCGCCACCTTTGGGCTCATCCTGGATGACGTGTCCTTGGTAAGATCCTTCGGGAGACCGAGGAGGGGAAGGGGCTGCAGTTCTCGTTTAGGTGCCTGGCTCCATTTCTGGGTAGACGCTATTAGGTCCTCCCTTCTGCTTTGCTAGATGTGAGACTTGAAAACACGGAAACATGCTGAGGTGAGGCAGTCTC
CGTGGGTTTTTCAGTTGAGGGTTCTTTTACCTTCCCCCTGCCACACACATTTTTCTTATGACCTCTGGTTGTATCCAGATAGTCTCTAACCACTAAATGTTTTACCTTCTCCAAACTGTTACCCAGAGAGTGATGCCTTGTTAACCCTGTTTGACACAGGCAGAAACTGCCTGGTAGAGACCAGAGAACAGCTCGGGTAGTCCTTCTCCCTAGCACAGACCTCCCAGCCTGACTCCTGGGAGCTTCCTAACACTTTACAGTCCGAAGCTCAGTGAAGTAAGCTCTGGGAACCCCAGTGAAAGGTGATAGAGTGTAAACGAACGGTTGGATTCCCCCAGGCCTGGTATAGGGGGCAAGGGACATCTCTGAGGCGTAAGCTATCCTCTTGAGACACTATAGCTTGTGTGTTTATATGACATTGGATGTCATAACTCAGAAAGCAATGCAGGCAGGATAGCGTTTCAGGTTGAGGAGGGTGAGGGGAAGGGGTCGTGTTTCTAGATTCTCTGGGAAAAACCATTTGGAGTGATTTGTTCGGGCAGTGAGGTAAAGTGTTTCCTGTTCAGTTCTCCCGTGCATTGCTAGGGAAAGGCACTGCCTCCCCCGGCATCTGTGCAGCTGTTTAAACAGCCACTTGACAACACCCAGTGCTAACCCCTGGGCACTGCTCCACCTTGCTCCGCCTGCTGGAAGTCCTGGGGGCTTGGGGCTCCCTCTGCTGGCAAGAGGCCAGGCTGCAGCCATTCTGTGGGCCCTTCCCTCGTAATTACCGTTAACCTGAACACCTTGGCTGTGAGAAAACGCTGAGTAAAAACCTAAGGGAAAAGTTGGCATTTTACTAGACTTTAACCACATACTCCATTCTGGGGAAATGTGGGCTGACCACAAGAAACCCTATCTAAGGTGTGAGAAGAAAATTAGGTTTCATGGGGAATTTGCTGCCCTCAGCTGGCCCTTGTAGAAATAAATTTTATTCCTTAATTATACATTTCATTTTTCATGTCTCAGGATCAGATTTTCTTACCCAAACTTTGACTAAGAAACTAGAAATGGATCAGGCGAAGTACAACAGCTGCAGTTAGAGTTAGGAGGTTAAAAATTCTGGAAGAGAATGAGACCAGGATTTACTCTTCAGGAGAAGTTTGGAGCTGCTTTTTTAAAAAGCAGAGGTTGGGAGAGTGGAGGAAATGAAACAACTAGAATTTGATGGCAAAACCAATGCTCTTCTCTTGATTCTTTTTCAATAAAAATTAGGATGAATAAGTAAATTGCTCTAGGCTGGGCATTAGGAATTCTGAGGTCCCTTCCCATTGTTTGTGCTATCTTTAGGGACAGTCCCGTCAGACCTGACATGATTAGGGAAGGTCTATGGATCATCAGACTTCTAAACCCTCATGCCGACCAATGACTTTACCTGCTTTCTCTTTCTTTTAAACCGTTTAACAGAACCATTCACATTGGGAATACCATGATTTGCGTTCCACCGTCCCTCGTCCTCTTCCTGTTCAGTGGTGGAGCTGCTGTGGGAAGACGCGGACTGGTTAATCCATAAACAGAGAGCATCAGGCTCTTGGATCCCTGGGAACCAGCTGCCTCCCTCACTCTCAGGGACCCTGTTTTCCATCTGGCCTTCCTTGGGCTTTGAACAAGGCATCAAAGGCCCTTGGAAGAGCACTAGTCAGTGGCGGGGGTCTTAGAACCCACAGTTCTCCTCCTCTGGGGAGGTGGTCGATTGAGTAGATACCTTCTGGTGCCTGTGGGCCCCATCAAAAGCCCCCGGTGCCATTTGCTACATGAGGTCACTGTACTGAGAGTGACAGAGTAATATACAGGAGCAGTTTGGGCAGCCAGAGAGTCTGGGTGTAAACTCAGTTTGGATACAGATACGGAGGTGGAAGAGTGTTCTGGCCTCACGGATGCCTCCAGCTGCTAGAGCCATTGCTGGCCTCTTCTTCCAGCGGCCATGGAGCCCTCCCAGCAGTGCTGTCGAAGCAATCACA
CTGCCTCATCTTGTGCTCACTCTCTCCCCTTAGACACATCTGACCTTCGGGAAGGAGTTCACAGAAGCGGTGGAAGCCAAACAGGTGGCTCAGCAGGAAGCAGAGAGGGCCAGATTTGTGGTGGAAAAGGTGAGCCTTCGACCAGATGGCAGGAGCCTCTCTCTCCCCTTTCTCCGGCACTCAGCTTCCCCATTTGCTGGGTGGCCTGGAAATTCATCATCTGTCATCCCTTCTTCCGGGATAATCAGAAGGGGCTTGAAGGAATTGTACTTCTGCAATTGGTTCCAGAGTCTTCAGGGGCTAGTCAAGGATATGTGGAGTTATGTTCCTAAATCACTGAAGGGTAATTTTTCTTCCACTTCTCTGAGATCAAAAACACTCTCTTACAAATAAAAATGTTTCTCCTGGAGTATTTTCAGCTTCACTGAGAAGTCATTTTTAACCATAGTTACATAGTGAAAGCTGACAGCAAAAAAGATCAAACGTTGCACCAGATGTGCTTTCGTCACTAGATTTTTTTCTAGTGCTAAATCCATCCAGATGTGTCAAAGAATGTGATGGGACACAGTGTATTTGCGTAGCAGCCTGGTCTTTCTGGTATTTGCAAAGACATGTTCATTTATTGTTGTCCCCTTCTTCCCACCACCAGTATCCCTAATTGGTGGGGAGATGGGGACAGCAAGAAATAAAATGGGAAAAGAGGGATAGATTTAATTTTGGAGAATGAAAACACTGTGTGGGCAGAGACTTGTGTTGCTTTGTATCTGCCATAACTTCAGAGATTATAATAAGTCTAGTACAGTGCCTGGTGATAGTAGGTATACAGTAAATGTTTGTTGAGCAAATAGACGCAGGGCCCAGTCATTTCAAAATTGTATGTAATTTCAGGGAGGCTTAATACTGTCTTCTTCCTCACACTCCTGAAGGTCACACGTTGCAGAGAGCTGTCTTCCTATTGATATTGGTAGGGCAAGCCTAGGAGATCTCACTCTGGGTGCCTGGATTCTGGTCAGGAACCAGCCTAACTCACAGGCAGCTCTAGGAACAGTCAAAAGTGCATGCTGCTCTTCCTTAGCCATCCCGAGGTTTTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTGACAGCTCTGTGGCCCAGGCTGGAGTGCAATGGCATGATCATAGCTCACTGCAGCCTTGGCCTCCTGGGCTCAAGTGATCCTCCTGCCTCCGCCTCCCAAAGTGCCAGGATTACAGGCATGAGCCACCACACCCGGCCCTGTCCTGGCTTTGATGAAGTCCTTTAGACTTAAGGCTGGAGGAAAAGATGAGCCTTGAGGATTGATTCCACCTTTCTTTTGCTTCTGTTTTCCTTGGCCTTGGCTTCTCCTGGCTCAGAGTAGGGTTGTTAAACTAGATTGCAATTAATATTAATGAGGACTTTGAAATAAGACAAATATTCCTGCAGCCAACAGAGATGTATCCCTCCCGTGACAAGGAGTGAGCATGAAAGGATAGGGGAGGACTGGTGGGCAATGTGCTCTGCTTCCCCCCGCTTCCCCCGCTAGCCATCAGGAGGAAGTAAACTCCCCGAGTTCCTTCAGGAGCCTGGGAAGGTGGCTTTCTGGTGAAGGGCCTTTGGTTGTAGCCTGACATGCGGTGCCCTGAGGTTTGATCTTTGTCTCCACCTCCATTCTTTTAGGCTGAGCAACAGAAAAAGGCGGCCATCATCTCTGCTGAGGGCGACTCCAAGGCAGCTGAGCTGATTGCCAACTCACTGGCCACTGCAGGGGATGGCCTGATCGAGCTGCGCAAGCTGGAAGCTGCAGAGGACATCGCGTACCAGCTCTCACGCTCTCGGAACATCACCTACCTGCCAGCGGGGCAGTCCGTGCTCCTCCAGCTGCCCCAGTGAGGGCCCACCCTGCCTGCACCTCCGCGGGCTGACTGGGCCACAGCCCCGATGATTCTTAACACAGCCTTCCTTCTGCTCCCACCCCAGAAATCACTGTGAAATTT
CATGATTGGCTTAAAGTGAAGGAAATAAAGGTAAAATCACTTCAGATCTCTAATTAGTCTATCAAATGAAACTCTTTCATTCTTCTCACATCCATCTACTTTTTTATCCACCTCCCTACCAAAAATTGCCAAGTGCCTATGCAAACCAGCTTTAGGTCCCAATTCGGGGCCTGCTGGAGTTCCGGCCTGGGCACCAGCATTTGGCAGCACGCAGGCGGGGCAGTATGTGATGGACTGGGGAGCACAGGTGTCTGCCTAGATCCACGTGTGGCCTCCGTCCTGTCACTGATGGAAGGTTTGCGGATGAGGGCATGTGCGGCTGAACTGAGAAGGCAGGCCTCCGTCTTCCCAGCGGTTCCTGTGCAGATGCTGCTGAAGAGAGGTGCCGGGGAGGGGCAGAGAGGAAGTGGTCTGTCTGTTACCATAAGTCTGATTCTCTTTAACTGTGTGACCAGCGGAAACAGGTGTGTGTGAACTGGGCACAGATTGAAGAATCTGCCCCTGTTGAGGTGGGTGGGCCTGACTGTTGCCCCCCAGGGTCCTAAAACTTGGATGGACTTGTATAGTGAGAGAGGAGGCCTGGACCGAGATGTGAGTCCTGTTGAAGACTTCCTCTCTACCCCCCACCTTGGTCCCTCTCAGATACCCAGTGGAATTCCAACTTGAAGGATTGCATCCTGCTGGGGCTGAACATGCCTGCCAAAGACGTGTCCGACCTACGTTCCTGGCCCCCTCATTCAGAGACTGCCCTTCTCACGGGCTCTATGCCTGCACTGGGAAGGAAACAAATGTGTATAAACTGCTGTCAATAAATGACACCCAGACCTTCCGGCTCA"
)
print(my_seq)
tamanho_seq = len(my_seq)
print(tamanho_seq)

# Percentage of each nucleotide in the sequence
A, T, G, C = [(my_seq.count(base) / tamanho_seq) * 100 for base in "ATGC"]
print("A: ", A)
print("T: ", T)
print("G: ", G)
print("C: ", C)

# GenBank file
# SeqIO.read is given the file path and the format name "genbank".
phbgb = SeqIO.read(
    "C:/Users/Zé Freitas/Desktop/Mestrado/Labs_Bioinf/Trabalho prático/scripts/Labs_Bioinf/PHB/sequence.gb",
    "genbank")
phbgb

# NOTE(review): this rebinds the builtin name `id` at module level.
id = phbgb.id
コード例 #28
0
# The same letters tagged with the extended DNA alphabet.
my_seq3 = Seq("AGTACACTGGT", IUPAC.extended_dna)
print(my_seq3.alphabet)

# ... and tagged with the protein alphabet.
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
print(my_prot)
print(my_prot.alphabet)

# Seq objects act like strings.
# NOTE(review): `my_seq` is not assigned anywhere above in this snippet --
# presumably it is defined earlier in the original script; confirm.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

print(len(my_seq))
print(my_seq[0])  # first letter
print(my_seq[-1])  # last letter

# Count occurrences of the two-letter pattern "AC".
print(my_seq.count("AC"))

# GC content as a percentage, computed by hand.
print(100 * float(my_seq.count("G") + my_seq.count("C")) / len(my_seq))

# Biopython's GC helper applied to the same sequence.
from Bio.SeqUtils import GC
print(GC(my_seq))

# Slicing a sequence
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print(my_seq[4:12])
print(my_seq[0::3])  # every third letter, starting at index 0
print(my_seq[1::3])  # every third letter, starting at index 1
print(my_seq[::-1])  # reverse
# Interpolate the sequence into a two-line FASTA-style record.
fasta_format_string = ">Name\n%s\n" % my_seq
print(fasta_format_string)
コード例 #29
0
ファイル: INI.py プロジェクト: selinj/rosalind
from Bio.Seq import Seq

# Only the first line of the input file is used; the line ending is stripped
# so the sequence holds bases only.
with open('rosalind_ini.txt', 'r') as handle:
    dna = Seq(handle.readline().strip())

# Write the A, C, G and T counts separated by single spaces.  The with-block
# closes the output file, so no explicit close() call is needed afterwards.
with open('01_INIout.txt', 'w') as out:
    out.write(" ".join(str(dna.count(base)) for base in "ACGT"))
コード例 #30
0
ファイル: 1_INI.py プロジェクト: ColtonMak/Rosalind
# Introduction to the Bioinformatics Armory

from Bio.Seq import Seq

seq = Seq(input())
# Count each base in A, C, G, T order, keeping the counts as strings.
a, c, g, t = [str(seq.count(base)) for base in "ACGT"]

print(" ".join([a, c, g, t]))
コード例 #31
0
ファイル: rosalind_Standard.py プロジェクト: kjh918/rosalind
'''
dna = open('rosalind_ini.txt','r')

dna = open('rosalind_ini.txt','r')

for i in dna:
    print(i.count('A'), i.count('C'), i.count('G'),i.count('T'))
    print(len(i))
dna = 'AGCTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
'''

from Bio.Seq import Seq
# Print the A, C, G, T counts of every line of the input file.  The
# with-block closes the file once the loop is done.
with open('rosalind_ini.txt', 'r') as txt:
    for i in txt:
        x = Seq(i)
        print(x.count('A'), x.count('C'), x.count('G'), x.count('T'))

# The same tally done by hand with a dictionary; symbols that are not keys
# of `dic` are skipped.
dic = {'A': 0, 'T': 0, 'G': 0, 'C': 0}
dna = 'AGCTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC'
for i in dna:
    if i in dic:
        dic[i] += 1
print(dic)

# 2. Transcribing DNA into RNA
# DNA -> RNA (T -> U)

dna = 'GATGGAACTTGACTACGTAAATT'
# `rna` ends up holding the transcription of the last line of the file; it
# stays an empty string when the file has no lines.  The file object gets its
# own name so the loop no longer rebinds the object it is iterating over.
rna = ''
with open('rosalind_rna.txt', 'r') as rna_file:
    for i in rna_file:
        rna = i.replace('T', 'U')
コード例 #32
0
def main(argv):
    """Print the A, C, G and T counts of the line read via argv[0].

    The line is obtained with files.read_line(argv[0]) and wrapped in a Seq;
    the four counts are printed on one line separated by single spaces.
    """
    bases = Seq(files.read_line(argv[0]))
    tallies = [str(bases.count(letter)) for letter in 'ACGT']
    print(' '.join(tallies))
コード例 #33
0
from Bio.Seq import Seq

dna = Seq(
    "AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC")

# Print the four base counts in A, C, G, T order, space-separated.
print(*[dna.count(base) for base in "ACGT"])
コード例 #34
0
ファイル: biopython.py プロジェクト: alanroche/pycam
# NOTE: this snippet uses Python 2 print statements.
from Bio.Seq import Seq
# Build a sequence and use it like a string: print, index, slice, len, count.
my_seq = Seq("AGTACACTGGT")
print my_seq
print my_seq[10]
print my_seq[1:5]
print len(my_seq)
print my_seq.count("A")

# Helper functions from Bio.SeqUtils applied to the same sequence.
from Bio.SeqUtils import GC, molecular_weight
print "GC: ", GC(my_seq)
print molecular_weight(my_seq)

# A sequence created with an explicit IUPAC unambiguous-DNA alphabet.
from Bio.Alphabet import IUPAC
my_dna = Seq("AGTACATGACTGGTTTAG", IUPAC.unambiguous_dna)
print my_dna
# A bare print statement emits an empty line.
print
print my_dna.alphabet
print my_dna.reverse_complement()

print my_dna.translate()
コード例 #35
0
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
import pyperclip

# Read the clipboard once and wrap its text as an unambiguous-DNA sequence.
my_seq = Seq(str(pyperclip.paste()), IUPAC.unambiguous_dna)
resultA = my_seq.count("A")
resultC = my_seq.count("C")
resultG = my_seq.count("G")
resultT = my_seq.count("T")
# Only A/C/G/T are counted, so any other clipboard characters are ignored.
result = resultA + resultG + resultC + resultT

# Guard against a clipboard that holds no A/C/G/T at all.
if result:
    GC_content = ((resultC + resultG) / result) * 100
else:
    GC_content = 0.0
print(GC_content)
# The clipboard holds text, so hand pyperclip a string explicitly.
pyperclip.copy(str(GC_content))
コード例 #36
0
#!/usr/bin/python3
import Bio
from Bio.Seq import Seq

dna1 = Seq("ATGACGACCAGTGACGATGACGTTTGACCAGAT")
dna2 = Seq("GGGACCAGACCAGATGACCAGATGACAGATGACAGACAGAT")

# Method 1: work out the GC percentage of dna1 by hand
dna1_len = len(dna1)
gc_dna = sum(dna1.count(base) for base in "GC")
gc_per = (gc_dna / dna1_len) * 100
print("The GC-Content in percentage :", gc_per, "%")


#Method 2: calculation of the GC percentage using function
def gc_content(x):
    """Return the GC content of sequence *x* as a percentage.

    Args:
        x: Any object supporting ``len()`` and ``count()`` (a str or a Seq).
           Only upper-case 'G' and 'C' are counted.

    Returns:
        ``(G + C) / len(x) * 100`` as a float, or ``0.0`` for an empty
        sequence (instead of raising ZeroDivisionError).
    """
    dna_len = len(x)
    if dna_len == 0:
        return 0.0
    gc_dna = x.count('G') + x.count('C')
    return (gc_dna / dna_len) * 100


# Report the value returned by the self-defined gc_content() for dna1.
print("The GC percentage using Self-Defined function:", gc_content(dna1), "%")

# Method 3: GC percentage computed with the GC helper from Bio.SeqUtils
from Bio.SeqUtils import GC
gc = GC(dna1)
print("GC % in Bio Python:", gc, "%")

#Suppose we want the GC percent in rounded form
#Method 1
コード例 #37
0
ファイル: rosalind.py プロジェクト: kevinychen/rosalind
def dna(s):
    """Print the counts of A, C, G and T in *s*, separated by spaces."""
    seq = Seq(s)
    print(' '.join([str(seq.count(base)) for base in 'ACGT']))
コード例 #38
0
def readfile(filename):
    """Return the (A, C, G, T) counts of the sequence stored in *filename*.

    The file content is stripped of surrounding whitespace and of every
    newline character before being wrapped in a Seq and counted.
    """
    with open(filename) as handle:
        raw_text = handle.read()
    content = Seq(raw_text.strip().replace("\n", ""))
    return tuple(content.count(base) for base in "ACGT")
コード例 #39
0
from Bio.Seq import Seq


def count_nucleotides(s):
    """Return the (A, C, G, T) occurrence counts of *s* via its count()."""
    return tuple(s.count(base) for base in "ACGT")


def count_nucleotides_2(s):
    """Return the (A, C, G, T) counts of *s* using a single pass.

    Symbols other than A, C, G and T (for example a trailing newline or an
    ambiguity code such as N) are ignored, so the result agrees with
    count_nucleotides() for every input instead of raising KeyError.
    """
    freq = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for symbol in s:
        if symbol in freq:
            freq[symbol] += 1
    return freq['A'], freq['C'], freq['G'], freq['T']


if __name__ == "__main__":
    s = "TCCTCGGGGACTTTCGCGCGACAACTGCGCAGATGTTGCGTGTAGGGCCGAAATATCCCCGCAACTTGAGAGCTCTGGAGACGCATTCACGAAACGCATCTAAGCTCGCGGTACAGAACGGTACGATTGGGCATCCGATATGCTGCATGTGATAGTGCGTTCGAAATCCTGCTGGGCCCGTGGCCATTGCTTCCAATCATCCGACCTAGTTTACCATGTCTGGCTTTGAACGGGAGGAACTCGCACACGAAAAAGCTACAGGCAGCCCCATCCGGATATCATCATGAGGGGGACTCTGGTCCATATTATAGTATTATCAATGTCATCTGTTAGTGCACGAGCTGTTTGTCTGCTGCTACGATCACCGACACGTTCGGTGGGGGGCCTTCGTTGTGGCCGTGAGATCAGCCCTGTGGCGCGTGGTCTAAGCTTTAGGCTAGCTAGGAGTGGCACGCTCGTTGGACCAATCGATTGAATACCTTGCCCATATGTCTGGACAAGGTGTGGGGGACGGTCCGGCGTGCCCTAGATCTGTCATATACGGCTTTGATCTCTTCATCTTCTCAGTCTATTAGTGGTCTATACATTCTAACCCCATTTAGTCCCTTGTGGAACTATACTTGGAATGAAGCACTCATCTGGAGGGGGGTGATGTCTTCCTTCGCCTGCCGAGAAACACTATACGTCGATCGCCCCCGCGTCGCCCCCTCATATTTAACCTGCATTTTTTTCCCCATAGGGAACACCGAAATTACGAGTGGCACACACGTACTATATTCGATGTGTAGTCCGGTTGCATTCGCCCGTGGACCA"
    my_seq = Seq(s)
    # Both statements print the counts in A, C, T, G order, space-separated.
    print("%d %d %d %d" % tuple([my_seq.count(base) for base in "ACTG"]))
    print(' '.join([str(my_seq.count(base)) for base in ['A', 'C', 'T', 'G']]))
コード例 #40
0
"""
The Seq object has a number of methods which act just like those of a Python
string (For example, the find and count methods).
"""

# NOTE(review): the imports below are commented out, so Seq and generic_dna
# must already be in scope for this snippet to run (Python 2 print syntax).
#from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
my_dna = Seq("AGTACACTGGT", generic_dna)
print my_dna
#Seq('AGTACACTGGT', DNAAlphabet())
my_dna.find("ACT")
# expected result: 5
my_dna.find("TAG")
# expected result: -1 (pattern not found)
my_dna.count("GG")
# note that count is non-overlapping, as it is for plain strings:
"AAAAAAA".count("AA")


"""
BioPython has several built-in functions for biological applications:
complement, reverse complement, translation, back translation
"""

#from Bio.Seq import Seq
#from Bio.Alphabet import generic_dna
#my_dna = Seq("AGTACACTGGT", generic_dna)
print my_dna
my_dna.complement()
# expected result: Seq('TCATGTGACCA', DNAAlphabet())
コード例 #41
0
ファイル: 318_test1.py プロジェクト: cgregg/codonmassager
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
from Bio.Alphabet import generic_rna
from Bio.Alphabet import generic_protein

# NOTE: this snippet uses Python 2 print statements.
my_dna = Seq("ATGGGGAGAAGGCCGTAG", generic_dna)
#print my_dna

#a = my_dna + 'aaa'
#print a

# String-like queries: index of the first match of each pattern, the
# sequence itself, the number of "A" letters and the length.
print my_dna.find('AGG')
print my_dna.find('AGA')
print my_dna
print my_dna.count('A')
print len(my_dna)

# Complement and transcription of the DNA sequence.
your_dna = my_dna.complement()
print your_dna
my_rna = my_dna.transcribe()
print my_rna

# Translate the RNA with an explicit codon table, stopping at the first stop.
my_protr = my_rna.translate(table=1, to_stop=True)
# table=1 is the default standard genetic code, see
# http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi#SG1
# to_stop=True tells translate() to stop at the first stop codon
print my_protr
# Translating the DNA directly, again stopping at the first stop codon.
my_protd = my_dna.translate(to_stop=True)
print my_protd

#playing with complete CDS'
#yaaX = Seq("GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCA" + \
コード例 #42
0
ファイル: ProtParam.py プロジェクト: Mat-D/biopython
class ProteinAnalysis:
    """Class containing methods for protein analysis.

    The class init method takes only one argument, the protein sequence as a
    string and builds a sequence object using the Bio.Seq module. This is done
    just to make sure the sequence is a protein sequence and not anything else.

    methods:

    count_amino_acids:

    Simply counts the number of times an amino acid is repeated in the protein
    sequence. Returns a dictionary {AminoAcid:Number} and also stores the
    dictionary in self.amino_acids_content.

    get_amino_acids_percent:

    The same as count_amino_acids, only returns the number as a fraction of
    the entire sequence. Returns a dictionary and stores the dictionary in
    self.amino_acids_percent.

    molecular_weight:
    Calculates the molecular weight of a protein.

    aromaticity:

    Calculates the aromaticity value of a protein according to Lobry, 1994. It is
    simply the relative frequency of Phe+Trp+Tyr.

    instability_index:

    Implementation of the method of Guruprasad et al. (Protein Engineering
    4:155-161,1990). This method tests a protein for stability. Any value above 40
    means the protein is unstable (=has a short half life).

    flexibility:
    Implementation of the flexibility method of Vihinen et al. (Proteins. 1994 Jun;19(2):141-9).

    gravy:
    Calculates the gravy according to Kyte and Doolittle.

    isoelectric_point:
    This method uses the module IsoelectricPoint to calculate the pI of a protein.

    secondary_structure_fraction:
    This method returns the fraction of amino acids which tend to be in Helix, Turn or Sheet.
    Amino acids in helix: V, I, Y, F, W, L.
    Amino acids in Turn: N, P, G, S.
    Amino acids in sheet: E, M, A, L.
    The result is a tuple of 3 values: (Helix, Turn, Sheet).

    protein_scale(Scale, WindowSize, Edge):

    An amino acid scale is defined by a numerical value assigned to each type of
    amino acid. The most frequently used scales are the hydrophobicity or
    hydrophilicity scales and the secondary structure conformational parameters
    scales, but many other scales exist which are based on different chemical and
    physical properties of the amino acids.  You can set several parameters that
    control the computation of a scale profile, such as the window size and the
    window edge relative weight value.  WindowSize: The window size is the length
    of the interval to use for the profile computation. For a window size n, we
    use the (n-1)/2 neighboring residues on each side of residue i to compute
    the score for residue i. The score for residue i is the sum of the scale values
    for these amino acids, optionally weighted according to their position in the
    window.  Edge: The central amino acid of the window always has a weight of 1.
    By default, the amino acids at the remaining window positions have the same
    weight, but you can make the residue at the center of the window have a
    larger weight than the others by setting the edge value for the residues at
    the beginning and end of the interval to a value between 0 and 1. For
    instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
    1.0, 0.7, 0.4.  The method returns a list of values which can be plotted to
    view the change along a protein sequence.  Many scales exist. Just add your
    favorites to the ProtParamData modules.
    """
    def __init__(self, ProtSequence):
        """Wrap ProtSequence in a Seq with the IUPAC protein alphabet.

        A sequence for which islower() is true is upper-cased first; any
        other sequence is stored as given.  The cached count and percent
        dictionaries start out as None.
        """
        if ProtSequence.islower():
            self.sequence = Seq(ProtSequence.upper(), IUPAC.protein)
        else:
            self.sequence = Seq(ProtSequence, IUPAC.protein)
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)

    def count_amino_acids(self):
        """Count every letter of IUPACData.protein_letters in the sequence.

        Returns a dictionary {AminoAcid: Number} and also stores it in
        self.amino_acids_content.
        """
        ProtDic = dict((aa, self.sequence.count(aa))
                       for aa in IUPACData.protein_letters)
        self.amino_acids_content = ProtDic
        return ProtDic

    def get_amino_acids_percent(self):
        """Calculate the amino acid content as a fraction of the length.

        Input is the dictionary from count_amino_acids (computed on demand).
        Output is a dictionary with the amino acids as keys; it is also
        stored in self.amino_acids_percent.  Amino acids that do not occur
        get the integer 0.
        """
        if not self.amino_acids_content:
            self.count_amino_acids()

        PercentAA = {}
        for aa, count in self.amino_acids_content.items():
            if count > 0:
                PercentAA[aa] = count / float(self.length)
            else:
                PercentAA[aa] = 0
        self.amino_acids_percent = PercentAA
        return PercentAA

    def molecular_weight(self):
        """Calculate the molecular weight from the protein sequence."""
        # Local dictionary for speed: each residue weight with one molecule
        # of water removed.
        MwDict = dict((aa, weight - 18.02)
                      for aa, weight in IUPACData.protein_weights.items())
        MW = 18.02  # add just one water molecule for the whole sequence.
        for aa in self.sequence:
            MW += MwDict[aa]
        return MW

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Arom = sum of the relative frequencies of Phe + Trp + Tyr.
        """
        if not self.amino_acids_percent:
            self.get_amino_acids_percent()

        percent = self.amino_acids_percent
        Arom = percent['Y'] + percent['W'] + percent['F']
        return Arom

    def instability_index(self):
        """Calculate the instability index.

        According to Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).  The dipeptide weights are read
        from ProtParamData.DIWV.
        """
        # Local alias for speed; the table is only read, never modified.
        DIWV = ProtParamData.DIWV
        score = 0.0
        for i in range(self.length - 1):
            score += DIWV[self.sequence[i]][self.sequence[i+1]]
        return (10.0/self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change the window size because the parameters are
        specific for a window=9.  The parameters used are optimized for
        determining the flexibility.  Returns a list of scores.
        """
        Flex = ProtParamData.Flex
        Window = 9
        Weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        scores = []
        # NOTE(review): range(self.length - Window) stops one window short of
        # protein_scale's range(self.length - Window + 1) -- confirm intent.
        for i in range(self.length - Window):
            SubSeq = self.sequence[i:i+Window]
            score = 0.0
            # Floor division keeps the loop bound an integer on Python 3.
            for j in range(Window//2):
                score += (Flex[SubSeq[j]]+Flex[SubSeq[Window-j-1]]) * Weights[j]
            # NOTE(review): index Window//2+1 is 5, while the centre of a
            # 9-residue window is index 4 -- kept as in the original; confirm.
            score += Flex[SubSeq[Window//2+1]]
            scores.append(score/5.25)
        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle."""
        ProtGravy = 0.0
        for aa in self.sequence:
            ProtGravy += kd[aa]

        return ProtGravy/self.length

    def _weight_list(self, window, edge):
        """Make the list of relative weights of the window edges.

        The weights are relative to the window center and linear.  Only half
        a list is generated: for a window of size 9 and edge 0.4 you get
        [0.4, 0.55, 0.7, 0.85].
        """
        unit = ((1.0-edge)/(window-1))*2
        # Floor division: window/2 is a float on Python 3 and cannot be used
        # as a range bound.
        return [edge + unit * i for i in range(window//2)]

    def protein_scale(self, ParamDict, Window, Edge=1.0):
        """Compute the profile produced by an amino acid scale.

        Similar to expasy's ProtScale:
        http://www.expasy.org/cgi-bin/protscale.pl

        ParamDict maps amino acid letters to scale values, Window is the
        window size and Edge the relative weight of the window edges.  The
        weight list holds only one tail: if the weights should be
        [0.4,0.7,1.0,0.7,0.4], _weight_list gives [0.4,0.7] and the full
        calculation is done in the loop.

        Letters missing from ParamDict produce a warning on sys.stderr and
        contribute nothing to the score.  Returns a list with one score per
        window position.
        """
        # generate the weights
        weight = self._weight_list(Window, Edge)
        scores = []
        # The score in each window is divided by the sum of weights; the
        # weight list is one sided, hence doubled, plus 1 for the middle.
        sum_of_weights = sum(weight)*2+1
        middle = Window//2

        for i in range(self.length-Window+1):
            subsequence = self.sequence[i:i+Window]
            score = 0.0
            for j in range(middle):
                # Walk from the outside of the window towards the middle.
                # The try/except avoids raising an exception on a
                # non-standard amino acid.
                try:
                    score += weight[j] * ParamDict[subsequence[j]] + weight[j] * ParamDict[subsequence[Window-j-1]]
                except KeyError:
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                             (subsequence[j], subsequence[Window-j-1]))

            # Now add the middle value, which always has a weight of 1.
            if subsequence[middle] in ParamDict:
                score += ParamDict[subsequence[middle]]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (subsequence[middle]))

            scores.append(score/sum_of_weights)
        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of the protein.
        """
        if not self.amino_acids_content:
            self.count_amino_acids()
        X = IsoelectricPoint.IsoelectricPoint(self.sequence, self.amino_acids_content)
        return X.pi()

    def secondary_structure_fraction(self):
        """Calculate the fraction of helix, turn and sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three numbers (Helix, Turn, Sheet).
        """
        if not self.amino_acids_percent:
            self.get_amino_acids_percent()
        percent = self.amino_acids_percent
        Helix = percent['V'] + percent['I'] + percent['Y'] + percent['F'] + percent['W'] + percent['L']
        Turn = percent['N'] + percent['P'] + percent['G'] + percent['S']
        Sheet = percent['E'] + percent['M'] + percent['A'] + percent['L']
        return Helix, Turn, Sheet
コード例 #43
0
ファイル: __init__.py プロジェクト: asafpr/pro_clash
def get_unmapped_reads(
    samfile, outfile1, outfile2, length, maxG, rev=False, all_reads=False,
    dust_thr=0):
    """
    Get the list of unmapped paired reads and write the reads (mate 1 and 2) to
    the fastq files outfile1 and outfile2. The names of the reads is the same
    (assume equal in bam file)
    If rev is set assume first read is the reverse complement and reverse
    complement it, put it as read 2 and treat the second read as read 1.
    Can handle single-end as well: the first `length` nt of a single-end read
    go to outfile1 and its last `length` nt to outfile2.
    If all_reads is True, return the names of the reads that are mapped.
    Arguments:
    - `samfile`: Open Samfile object
    - `outfile1`: Open fastq file for reads 1
    - `outfile2`: Open fastq file for reads 2
    - `length`: Write the first X nt of the sequences
    - `maxG`: Maximal fraction of G's in any of the reads
    - `rev`: Reads are reverse complement (Livny's protocol). Has no influence
             on single-end reads
    - `all_reads`: Return all reads, including mapped ones
    - `dust_thr`: DUST filter threshold. If=0, not applied.
    Returns:
    The set of names (qname) of the reads recorded as mapped; it can only be
    non-empty when all_reads is True.
    """
    # A read holding at least this many G's (C's on the reverse-complemented
    # mate) in the tested region is skipped.
    max_g_count = int(maxG*length)
    single_mapped = set()
    for read in samfile.fetch(until_eof=True):
        if (not read.is_paired) and (read.is_unmapped or all_reads):
            cseq = read.seq
            cqual = read.qual
            # Decide per read whether to reverse complement.  A separate flag
            # set only for reverse reads would be unbound on the first
            # forward read and would stay True for every read after one
            # reverse read.
            if read.is_reverse:
                # This can't happen unless all_reads is set to True
                cseq = str(Seq(cseq).reverse_complement())
                cqual = cqual[::-1]
            if all_reads and (not read.is_unmapped):
                single_mapped.add(read.qname)
            # Test both ends of the read, since both ends are written out.
            if cseq.count('G', 0, length) >= max_g_count or\
                    cseq.count('G', -length) >= max_g_count:
                continue
            # The DUST filter is not applied in this single-end branch.
            outfile1.write("@%s\n%s\n+\n%s\n"%(
                    read.qname, cseq[:length],
                    cqual[:length]))
            outfile2.write("@%s\n%s\n+\n%s\n"%(
                    read.qname, cseq[-length:],
                    cqual[-length:]))
            continue
        if (all_reads or read.is_unmapped or read.mate_is_unmapped or\
                (not read.is_proper_pair)) and read.is_paired:
            if all_reads and not (read.is_unmapped or read.mate_is_unmapped or\
                (not read.is_proper_pair)):
                single_mapped.add(read.qname)
            if read.is_read1==rev:
                ouf = outfile2
                outseq = Seq(read.seq)
                outqual = read.qual[-length:]
                # Reverse complement the read if it haven't been
                # done in the bam file. Otherwise, do nothing
                if not read.is_reverse:
                    outseq = outseq.reverse_complement()
                    outqual = read.qual[::-1][-length:]
                if (str(outseq).count('C')>=max_g_count):
                    continue
                outseq = str(outseq[-length:])
            else: # First read in the fragment
                ouf = outfile1
                outseq = Seq(read.seq)
                outqual = read.qual[:length]
                if read.is_reverse:
                    outseq = outseq.reverse_complement()
                    outqual = read.qual[::-1][:length]
                if outseq.count('G') >= max_g_count:
                    continue
                outseq = str(outseq[:length])
            # test if read passes DUST filter
            if pass_dust_filter(outseq, dust_thr):
                ouf.write("@%s\n%s\n+\n%s\n"%(read.qname, outseq, outqual))
    return single_mapped
コード例 #44
0
def basecount(dna):
    """Print the A, C, G and T counts of a DNA string on one line.

    The four counts are written to stdout separated by single spaces, in the
    order A C G T.  Nothing is returned.
    """
    sequence = Seq(dna)
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print(" ".join(str(sequence.count(base)) for base in "ACGT"))
コード例 #45
0
ファイル: INI.py プロジェクト: kub4/rosalind
Sample Dataset
--------------

AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTCTGATAGCAGC

Sample Output
-------------

20 12 17 21

"""
import sys
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna

# Read the whole input file named on the command line, drop every
# whitespace character, upper-case the rest and wrap it in a Seq tagged
# as generic DNA.
with open(sys.argv[1], 'r') as in_file:
  raw_text = in_file.read()
dna = Seq(''.join(raw_text.upper().split()), generic_dna)

# The bases are reported in this fixed order.
baseorder = "ACGT"

# One count per base, in the same order as baseorder.
basecount = [dna.count(base) for base in baseorder]

# Emit the counts on a single line, separated by spaces.
print(" ".join(map(str, basecount)))
コード例 #46
0
from Bio.Seq import Seq
my_seq = Seq(
    "CTGTTGCAGGTCGGCTAGGCATTTGACATTGCACTCATCAAGACACACGGATAGGGGCTTGTGAGGTGCTCTAACAGCTGGGGCTCCAAGTGATAAAACCGTGTAAACACAGGACTGGCAGCTATCACCGACAGTGGCTTAGTATGATCTCAGTCGTATTTACGTCTCGGACACCCCGAACGTACTGACTCCGCTTAATTTAGGCCACCGCGGAGACGGAGAATTCCACATGACGTTGCACCGTATAAGGACGCGAAAGTAGTCCGCTGATTTGGACTCTCTCTCTTGAGCGTATAATCCGAAGTCTTTCGCTCAGCACATATTTTGCAACCTTTGAATACGGCCAGTTCGACGTCCGGATGTGCTTCTGATACTCCCCCTAGAGGAAAATAGAGCTGAGGTCAACTGCTAGCCAACCTGATCCTAAGGCAATACTGACCATGTCTCGTTCTTAGGTGATGGCGCACCTAAGCTCGTGCACTGTTGAAAGCAAGACTGATCACGCGGTGCAGAGTACAAGTGAGTTAACCCGCCTAAGCGCTGTTATGCACACGAGAAAATCATGACTTAGCTTAACTTAGAAGACCCAAGTGACGTTTCCTTGGGAGGAGTGTGCTCTACCACCTGGGTGTTGACGAGGGGGTTAATACCGCTTTTTTGTGTTCCATTACACGAGTAAGGCAACAGTAATCATGAATTCCTCGGTCAGCGGACAGTCAAGGTAGCAGTTCCTGTGTGTAATAATAACAGATGGTTGGACCTTGATTTGCATTCTTCGGCTGATCAGCATACACCATCTGGCTCTTTCATCATGCTCTACGGTCACCCAACAGAGAGTGTAGTTCGACTCGCGCGAGAGTAGCCAATCGCCGTCCTTTTCGAGTTGGGGTTACCCATAGAAAGAACCTATTAACGCTCTAACTGGGAGTTCGAAAAATATCGTAGACTCGCGACTGCTAAGCCGCTATCATAGTTGTGTGTTGTTCTTGAGGCG"
)
# Report each base count on its own line.  The label keeps the two spaces
# the Python 2 print statement produced ("Count A " followed by the comma's
# separator); passing a single parenthesised string prints the same text
# under both Python 2 and Python 3.
for base in "ACGT":
    print("Count %s  %s" % (base, my_seq.count(base)))
コード例 #47
0
from Bio.Seq import Seq
my_seq = Seq(
    "GATCGAATACGATGTCTGGAAACTATTCTGCTGTGAGGACTTGGACAAGAGCCATGAGAACATGACGTCCATAACCTAGCGGTATGGTAGTCACAATTGTCCACGCGGGCCCAATGCACGGTGGGATATGACGCCCTGTCAGGGTAGACCCCGGTTCGGTATACGTCGTTGTGAGTTAGGTAGCTAGGTTCTCACATTCTACAATGCATCTACCTGAAAGTGCAAGTGGAAAAGTCCGTTCGCTTCGGCGTCTGACACTACAACCCTAGAGCTTTCAATGGTGATTCCCGATAGAACTCGCTGCATGGGTCAATACGGTACCGGAACTACGAATGCGAGCTAGAGAGCTAACTTCTATAGCATCGAGATTTCGTTCCAGCACTAAGTGAGGAACGCCAGTCCCTGGCCATACCTTAGTACGGTTAACATGTCTGGAGCCAGTGGCCAGGTCGAGGAAATTACAATTGGTGTGGGTGAACCTCCAAACGCGGACTTACAGGTGCTGAACAACATCATTTTCTTGTAGTGCTCGCTGATTTTCAGTGCTTCAGTCGATCATTGGCGTCCTTCTACAGAACGGGTCGACAGGAAAGAATTAGGGAGCACGACTCCACGGGTCCGGGGTAAAGGGACATCCGACTAATGCATACCCGTCGTGAATGTTCCACGATAATCACGGGCCACTGATGTCCGTGTCTGGTCAGGGATATTATGGTTAGCGCTTGCCACTTCAGTTAATCGCAACACGCGGTGCGCCCTAGAGCATGTCATCTATAAATCACAGTTCATTCGCGTTTCGATCCTTCTTAGTTTGCATGAGAACTACTCTTCGCCTATTGTCAGAGTTGCAAGACCCAGA"
)
# One line of output: the A, C, G and T counts separated by single spaces.
print(" ".join(str(my_seq.count(base)) for base in "ACGT"))
from Bio.Seq import Seq
# Read the dataset and strip trailing whitespace.  The with-block closes
# the file handle as soon as the text has been read.
with open('rosalind_ini.txt','r') as handle:
    dnaSeq = Seq(handle.read().rstrip())

# The set of expected symbols is known in advance, so only these are counted.
nucleotides = ['A','C','G','T']

# Counts are stored as strings, in the same order as `nucleotides`.
# The data is expected to contain only upper-case letters; lower-case
# input is not normalised here.
nucleotidesCount = [str(dnaSeq.count(i)) for i in nucleotides]

# A single parenthesised argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print(' '.join(nucleotidesCount))
コード例 #49
0
ファイル: 03.code.py プロジェクト: guochangjiang/Python.learn
# The same eleven letters, tagged with the IUPAC extended DNA alphabet.
my_seq3 = Seq("AGTACACTGGT", IUPAC.extended_dna)
print(my_seq3.alphabet)

# The same letters again, this time tagged as a protein sequence.
my_prot = Seq("AGTACACTGGT", IUPAC.protein)
print(my_prot)
print(my_prot.alphabet)

# Seq objects act like strings: they can be iterated, measured and indexed.
# NOTE(review): my_seq is not assigned in this excerpt before this point --
# presumably it is defined earlier in the file; confirm.
for index, letter in enumerate(my_seq):
    print("%i %s" % (index, letter))

print(len(my_seq))
print(my_seq[0])  # first letter
print(my_seq[-1])  # last letter

# Number of occurrences of the two-letter subsequence "AC".
print(my_seq.count("AC"))

# GC content computed by hand, as a percentage of the sequence length.
print(100*float(my_seq.count("G")+my_seq.count("C"))/len(my_seq))

# The same figure from the Bio.SeqUtils helper.
from Bio.SeqUtils import GC
print(GC(my_seq))


# Slicing a sequence (my_seq is rebound to a new sequence here).
my_seq = Seq("GATCGATGGGCCTATATAGGATCGAAAATCGC", IUPAC.unambiguous_dna)
print(my_seq[4:12])   # positions 4..11
print(my_seq[0::3])   # every third letter, starting at index 0
print(my_seq[1::3])   # every third letter, starting at index 1
print(my_seq[::-1])   # reversed
# Interpolating the Seq with %s builds a minimal FASTA record as a string.
fasta_format_string = ">Name\n%s\n" % my_seq
コード例 #50
0
# Sequence used by every example below.
# NOTE(review): every statement after this assignment is a bare expression
# whose value is discarded when run as a script -- presumably the snippet
# was written for an interactive session; confirm.
seq = Seq("GGACCTGGAACAGGCTGAACCCTTTATCCACCTCTCTCCAATTATACCTATCATCCTAACTTCTCAGTGGACCTAACAATCTTCTCCCTTCATCTAGCAGGAGTC")

# Alphabet attached to the sequence
seq.alphabet

# Type of the alphabet object
type(seq.alphabet)

# Find a sub-sequence: yields its position when present, otherwise -1
seq.find("ATC")

seq.find("ATGC")

# Number of `A`
seq.count("A")

# Number of `C`
seq.count("C")

# Number of `T`
seq.count("T")

# Number of `G`
seq.count("G")

# K-mer analysis, K = 2: occurrences of the dimer "AA"
seq.count("AA")

# K-mer analysis, K = 3: occurrences of the trimer "AAA"
seq.count("AAA")
コード例 #51
0
ファイル: ProtParam.py プロジェクト: BIGLabHYU/biopython
class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is then converted to a
    sequence object using the Bio.Seq module. This is done just to make sure
    the sequence is a protein sequence and not anything else.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    """

    def __init__(self, prot_sequence, monoisotopic=False):
        """Store the sequence as a Seq and reset the cached residue counts."""
        # Only an all-lower-case input is upper-cased; mixed-case input is
        # stored exactly as given.
        if prot_sequence.islower():
            self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        else:
            self.sequence = Seq(prot_sequence, IUPAC.protein)
        # Caches filled lazily by count_amino_acids / get_amino_acids_percent.
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number} with one key per
        letter of IUPACData.protein_letters.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = {
                aa: self.sequence.count(aa)
                for aa in IUPACData.protein_letters
            }

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the relative amino acid content.

        The same as count_amino_acids, except that each count is divided by
        the sequence length. Despite the method name the values are
        fractions between 0 and 1, not multiplied by 100.
        Returns a dictionary of {AminoAcid:fraction}.

        The return value is cached in self.amino_acids_percent.

        Raises ZeroDivisionError for an empty sequence.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()
            total = float(self.length)
            self.amino_acids_percent = {
                aa: count / total for aa, count in aa_counts.items()
            }

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence."""
        # Inside a method body the bare name resolves to the module-level
        # molecular_weight function, not to this method.
        return molecular_weight(self.sequence, monoisotopic=self.monoisotopic)

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aromatic_aas = 'YWF'
        aa_percentages = self.get_amino_acids_percent()

        return sum(aa_percentages[aa] for aa in aromatic_aas)

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).

        Raises ZeroDivisionError for an empty sequence.
        """
        index = ProtParamData.DIWV
        score = 0.0

        # Sum the table value of every pair of adjacent residues.
        for i in range(self.length - 1):
            first, second = self.sequence[i:i + 2]
            score += index[first][second]

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.

        Returns a list with one score per window position visited.
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        # Only the first four weights are indexed by the loop below; the
        # final 1 is the weight implicitly given to the middle residue.
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        scores = []

        # NOTE(review): two details here look like off-by-one slips. They are
        # left unchanged because fixing them alters the numeric output --
        # confirm against Vihinen et al. 1994 before touching:
        #   * range(self.length - window_size) never visits the last full
        #     window (protein_scale uses "+ 1" in the equivalent loop);
        #   * the centre of a 9-residue window is index 4, yet index 5 is
        #     used as "middle", so index 5 is scored twice and index 4 never.
        for i in range(self.length - window_size):
            subsequence = self.sequence[i:i + window_size]
            score = 0.0

            for j in range(window_size // 2):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] + flexibilities[back]) * weights[j]

            middle = subsequence[window_size // 2 + 1]
            score += flexibilities[middle]

            # 5.25 is the sum of all nine positional weights.
            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle.

        Raises ZeroDivisionError for an empty sequence.
        """
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center. The weights are linear.
        it actually generates half a list. For a window of size 9 and edge 0.4
        you get a list of [0.4, 0.55, 0.7, 0.85].

        A window of 0 or 1 has no flanking positions and yields an empty list.
        """
        half = window // 2
        if half == 0:
            # Nothing to weight; returning early also avoids dividing by
            # (window - 1) == 0 when window is 1.
            return []

        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length
        of the interval to use for the profile computation. For a window size n, we
        use the i-(n-1)/2 neighboring residues on each side to compute
        the score for residue i. The score for residue i is the sum of the scaled values
        for these amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window  have a
        larger weight than the others by setting the edge value for the  residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to
        view the change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Residues missing from param_dict contribute nothing to the score and
        produce a warning on stderr.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
        #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
        #   in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i + window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    # If either lookup fails, neither residue of the pair
                    # is added to the score.
                    sys.stderr.write('warning: %s or %s is not a standard amino acid.\n' %
                             (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write('warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet). L is counted
        in both the helix and the sheet fraction.
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum(aa_percentages[r] for r in 'VIYFWL')
        turn = sum(aa_percentages[r] for r in 'NPGS')
        sheet = sum(aa_percentages[r] for r in 'EMAL')

        return helix, turn, sheet
コード例 #52
0
# 4.4.2.gc_contents.py
from Bio.Seq import Seq

exon_seq = Seq("ATGCAGTAG")
# Count G and C separately, then express their sum as a percentage of the
# sequence length.
g_count, c_count = (exon_seq.count(base) for base in "GC")
gc_contents = (g_count + c_count) / len(exon_seq) * 100
print(gc_contents)  # 44.44
コード例 #53
0
def readFamilySequences(fileList, temparr, family):    #temparr is the ancestorSeq
    """Collect the aligned sequence pairs of one family from bz2 files.

    Each file is read line by line.  Lines starting with '#' are ignored.
    A line with more than one whitespace-separated field is a header: when
    its second field equals `family`, its third and fourth fields are kept
    as integer coordinates and the next two single-field lines are taken
    as the organism sequence and the original sequence, in that order.

    The original sequence, lower-cased and with its dashes removed, is
    searched for in `temparr` (the sequence is used as a regular-expression
    pattern).  When it is not found, the stripped sequence, the organism
    sequence and the original sequence are all reverse-complemented.
    Dashes at either end of the organism sequence, and then of the original
    sequence, are trimmed from both sequences in lockstep.  Records whose
    original sequence consists only of dashes (or is empty after trimming)
    are skipped.

    Returns a list with one entry per accepted record:
    [original_seq, organism_seq, start, start, coord0, coord1], where start
    is the match position found below adjusted for trimmed leading dashes.

    May raise AttributeError when the original sequence cannot be located
    in `temparr` in either orientation.
    """
    whitespace_run = re.compile(r"\s+")
    dash_run = re.compile(r"-+")

    arrforall = []    # one entry per accepted record, across all files
    for x in fileList:
        # 0: ignore sequence lines, 1: next is the organism sequence,
        # 2: next is the original sequence.
        correctFamily2 = 0
        coor_arr = []     # coordinates taken from the current header line
        # The with-block closes the compressed file even if parsing fails.
        with bz2.BZ2File(x, 'r') as fileptr2:
            for line in fileptr2:
                # BZ2File yields bytes on Python 3; str on Python 2 is left as is.
                if not isinstance(line, str):
                    line = line.decode()
                if line[0] == '#':    # KARRO: Allow us to comment out lines in the file (for testing)
                    continue
                line = line.rstrip()
                arr = whitespace_run.split(line)
                if len(arr) > 1:
                    if arr[1] == family:
                        # Start from a fresh pair for every header so that a
                        # skipped record cannot leak its coordinates into
                        # the next one.
                        coor_arr = [int(arr[2]), int(arr[3])]
                        correctFamily2 = 1
                    else:
                        correctFamily2 = 0
                    continue

                if correctFamily2 == 1:        # organism sequence line
                    correctFamily2 = 2
                    organism_sequence = line
                    continue
                if correctFamily2 != 2:
                    continue

                # original sequence line
                origi_sequence = line
                line = "".join(dash_run.split(line.lower()))    # reduce dashes
                if re.search(line, temparr) is None:
                    # Not found as written: retry with the reverse
                    # complement of all three sequences (via Biopython).
                    line = str(Seq(line).reverse_complement())
                    organism_sequence = str(Seq(organism_sequence).reverse_complement())
                    origi_sequence = str(Seq(origi_sequence).reverse_complement())

                # Put the dashed original back into the ancestor and locate it.
                temparr2 = temparr.replace(line, origi_sequence)
                r1 = re.search(origi_sequence, temparr2)
                start = r1.start()
                end = r1.end()

                # Original author's remark: "Not sure about this four while parts".
                # Remove the dashes at the front and the rear of the organism
                # sequence, dropping the same positions from the original.
                # startswith/endswith stop cleanly on an empty string.
                while organism_sequence.startswith("-"):
                    organism_sequence = organism_sequence[1:]
                    origi_sequence = origi_sequence[1:]
                    start = start + 1
                while organism_sequence.endswith("-"):
                    organism_sequence = organism_sequence[:-1]
                    origi_sequence = origi_sequence[:-1]
                    end = end - 1
                if origi_sequence.count('-') == len(origi_sequence):   # skip if only "-" (or nothing) is left
                    continue

                # At least one non-dash character remains, so these loops
                # terminate before the string is exhausted.
                while origi_sequence[0] == "-":
                    organism_sequence = organism_sequence[1:]
                    origi_sequence = origi_sequence[1:]
                while origi_sequence[-1] == "-":
                    organism_sequence = organism_sequence[:-1]
                    origi_sequence = origi_sequence[:-1]

                # NOTE(review): `start` is stored twice and `end` is computed
                # but never used -- possibly the fourth field was meant to be
                # `end`; confirm against the callers before changing it.
                arrforall.append([origi_sequence, organism_sequence, start, start,
                                  coor_arr[0], coor_arr[1]])
                coor_arr = []
    return arrforall
コード例 #54
0
ファイル: ini.py プロジェクト: patcon1/rosalind
__author__ = 'pconn'

# first go at binf armory

from Bio.Seq import Seq

# Prompt for a DNA string and wrap it in a Seq.
instring = Seq(input('put in the string'))
# Count the four bases, then print the counts in the order A C G T.
a_count, c_count, g_count, t_count = [instring.count(base) for base in "ACGT"]
print(a_count, c_count, g_count, t_count)

コード例 #55
0

from Bio.Seq import Seq 

# Example sequence used by every statement below.
my_seq = Seq("AGTACACTGGT")

print(my_seq)

# Length of the sequence.
print(len(my_seq))

# Individual letters by index.
print(my_seq[0])
print(my_seq[1])

# Slice covering the first three letters.
print(my_seq[0:3])

# NOTE(review): from here on the statements are bare expressions whose
# results are discarded when run as a script -- presumably meant for an
# interactive session; confirm.
# Per-base counts.
my_seq.count("G")
my_seq.count("C")
my_seq.count("A")
my_seq.count("T")

# Case conversion, complement/reverse complement, transcription and
# translation methods of Seq.
my_seq.lower()
my_seq.upper()
my_seq.complement()
my_seq.reverse_complement()
my_seq.transcribe()
my_seq.translate()


from Bio.SeqUtils import GC

#get GC content
コード例 #56
0
        axis=1)
    idx = potential_codons['overall_score'].idxmax()
    # Search back and find what codons corresponds to this index with maximum overall score
    better_codons += codon_to_acid_with_freq_and_at_count.loc[idx].codon

# Wrap the accumulated codon string in a Seq tagged with generic_rna.
better_seq = Seq(better_codons, generic_rna)


def print_separator():
    """
    Write a row of '=' characters as wide as the current terminal.

    The width falls back to 80 columns when the terminal size cannot be
    determined.  No trailing newline is written.
    """
    width = shutil.get_terminal_size((80, 20)).columns
    bar = '=' * width
    print(bar, end='')


# Tally each of the four letters in the optimized sequence.
g_count, c_count, a_count, t_count = (
    better_seq.count(base) for base in 'GCAT'
)

# Report: the weighting in use, the sequence itself, then the letter
# totals, each framed by separator bars.
print_separator()
print(f'Optimizing for {args.weight:.2%} A/T count (decrease G/C) and {1.0 - args.weight:.2%} frequency in human genome')
print_separator()
print(better_seq)
print_separator()
print(f'Total Nucleotides: {len(better_seq)}. G Count: {g_count}, C Count: {c_count}, A Count: {a_count}, T Count: {t_count}')
print_separator()
コード例 #57
0
ファイル: 3_1_protein.py プロジェクト: alanroche/biopy
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
# Eleven letters tagged with the IUPAC protein alphabet.
my_seq = Seq("AGTACACTGGT", IUPAC.protein)
print("SEQ: ", my_seq)
print("ALPHABET: ", my_seq.alphabet)

# Sanity checks on indexing and counting.
first_letter = my_seq[0]
assert 'A' == first_letter
t_total = my_seq.count("T")
assert t_total == 3

# Rebind my_seq to an unambiguous DNA sequence for the GC calculation.
my_seq = Seq('GATCGATGGGCCTATATAGGATCGAAAATCGC', IUPAC.unambiguous_dna)

# GC content as a percentage of the sequence length.
gc_total = my_seq.count("G") + my_seq.count("C")
print('GC%=', 100 * float(gc_total) / len(my_seq))
コード例 #58
0
from Bio.Seq import Seq
my_seq = Seq(
    "ACATGCCACTCCTGCGCGCCAACAAAGGTGTAGTTGGCCGAAGACCTCATAATCATGGGACCTCCAACACCCGATAATGTCTATGGATGCGCCCTCCGATTTAGGTCGGCGCCTAATGACGATCCACCAAGCTCCCACGCCATGGTACACCCCGAAGCGGGTACCATGTGCACAGTATGCCGACGGGCGTACATATAAATTGGATACTGGATACCTGGATGGGCCTAGAGGAATTAAATCGGCCTCTGTCCTACCTTGTCCGCATAGTCCCACGCTTACATTTGGAACGGGAAGAGACCAGTCTAGACGCATTGCCAACATCACGTCGGACTAGATCCGCGTATACGTCGGCTAACCTCTTCGGAGGGTCGCGGTGACCATGGCCTGCTAGACTACAGCTTCGAATGTCCCTGTGGGTCCGTTACCCGGTCTGCCGGGTTGTATGTACCTCTGTATTCACTATACCATACTGTGCATCGGGCCTAGCAATCCGATATCCTATTCGCCGCCTCGCACTCAGCAAGACTGGTAAGTTCCTAAGGTCCGCTGATCCAGGGAGCAGGGCAGTAAAGGTCATTTATACATAGTCCGAGTGACAGACCGCAAAGTACACTCGGGTTCTATTGTATCCGTGGGCCCATTCGTAATTGCATGTGAGCACTCGTTCCCTCGTCAATATATAATTGATCACCTGTTCGAACCCTTACGGCCTCATGGCACTTCGTGTACGTTCGCAGATTAACCCAAGGCCGAGAAGCGGAGTCCGGTGAAAACCACCACTGAGGAATCGATACGGAGGCTGTTAAAGAACCACTTTTCTTATCCTTGACCTCACCACGGGCTTCATTCATCTCCTTACGGCGCTGG"
)
# Build "A C G T" counts as one space-separated string, then print it.
p = " ".join(str(my_seq.count(base)) for base in "ACGT")
print(p)
コード例 #59
0
ファイル: ini.py プロジェクト: luizirber/rosalind
#!/usr/bin/env python

from __future__ import print_function
import os

from Bio.Seq import Seq


if __name__ == "__main__":
    # Read the dataset from data/rosalind_ini.txt, dropping trailing
    # whitespace, and wrap it in a Seq.
    dataset_path = os.path.join('data', 'rosalind_ini.txt')
    with open(dataset_path) as dataset:
        seq = Seq(dataset.read().rstrip())
        # Counts in the order A C G T, printed space-separated.
        counts = [seq.count(base) for base in ('A', 'C', 'G', 'T')]
        print(*counts)
コード例 #60
0
ファイル: ProtParam.py プロジェクト: olgabot/biopython
class ProteinAnalysis(object):
    """Class containing methods for protein analysis.

    The constructor takes two arguments.
    The first is the protein sequence as a string, which is then converted to a
    sequence object using the Bio.Seq module. This is done just to make sure
    the sequence is a protein sequence and not anything else.

    The second argument is optional. If set to True, the weight of the amino
    acids will be calculated using their monoisotopic mass (the weight of the
    most abundant isotopes for each element), instead of the average molecular
    mass (the averaged weight of all stable isotopes for each element).
    If set to false (the default value) or left out, the IUPAC average
    molecular mass will be used for the calculation.

    """

    def __init__(self, prot_sequence, monoisotopic=False):
        """Store the sequence as a Seq and reset the cached residue counts."""
        # Only an all-lower-case input is upper-cased; mixed-case input is
        # stored exactly as given.
        if prot_sequence.islower():
            self.sequence = Seq(prot_sequence.upper(), IUPAC.protein)
        else:
            self.sequence = Seq(prot_sequence, IUPAC.protein)
        # Caches filled lazily by count_amino_acids / get_amino_acids_percent.
        self.amino_acids_content = None
        self.amino_acids_percent = None
        self.length = len(self.sequence)
        self.monoisotopic = monoisotopic

    def count_amino_acids(self):
        """Count standard amino acids, returns a dict.

        Counts the number times each amino acid is in the protein
        sequence. Returns a dictionary {AminoAcid:Number} with one key per
        letter of IUPACData.protein_letters.

        The return value is cached in self.amino_acids_content.
        It is not recalculated upon subsequent calls.
        """
        if self.amino_acids_content is None:
            self.amino_acids_content = {
                aa: self.sequence.count(aa)
                for aa in IUPACData.protein_letters
            }

        return self.amino_acids_content

    def get_amino_acids_percent(self):
        """Calculate the relative amino acid content.

        The same as count_amino_acids, except that each count is divided by
        the sequence length. Despite the method name the values are
        fractions between 0 and 1, not multiplied by 100.
        Returns a dictionary of {AminoAcid:fraction}.

        The return value is cached in self.amino_acids_percent.

        Raises ZeroDivisionError for an empty sequence.
        """
        if self.amino_acids_percent is None:
            aa_counts = self.count_amino_acids()
            total = float(self.length)
            self.amino_acids_percent = {
                aa: count / total for aa, count in aa_counts.items()
            }

        return self.amino_acids_percent

    def molecular_weight(self):
        """Calculate MW from Protein sequence.

        Uses the monoisotopic weight table when self.monoisotopic is true,
        otherwise the average weight table. Raises KeyError for a residue
        letter missing from the selected table.
        """
        if self.monoisotopic:
            water = 18.01
            iupac_weights = IUPACData.monoisotopic_protein_weights
        else:
            iupac_weights = IUPACData.protein_weights
            water = 18.02

        # Per-residue weights with one molecule of water removed from each.
        aa_weights = {
            aa: weight - water for aa, weight in iupac_weights.items()
        }

        # Add just one water molecule for the whole sequence, then
        # accumulate residue by residue.
        total_weight = water
        for aa in self.sequence:
            total_weight += aa_weights[aa]

        return total_weight

    def aromaticity(self):
        """Calculate the aromaticity according to Lobry, 1994.

        Calculates the aromaticity value of a protein according to Lobry, 1994.
        It is simply the relative frequency of Phe+Trp+Tyr.
        """
        aromatic_aas = 'YWF'
        aa_percentages = self.get_amino_acids_percent()

        return sum([aa_percentages[aa] for aa in aromatic_aas])

    def instability_index(self):
        """Calculate the instability index according to Guruprasad et al 1990.

        Implementation of the method of Guruprasad et al. 1990 to test a
        protein for stability. Any value above 40 means the protein is unstable
        (has a short half life).

        See: Guruprasad K., Reddy B.V.B., Pandit M.W.
        Protein Engineering 4:155-161(1990).

        Raises ZeroDivisionError for an empty sequence.
        """
        index = ProtParamData.DIWV
        score = 0.0

        # Sum the table value of every pair of adjacent residues.
        for i in range(self.length - 1):
            first, second = self.sequence[i:i + 2]
            score += index[first][second]

        return (10.0 / self.length) * score

    def flexibility(self):
        """Calculate the flexibility according to Vihinen, 1994.

        No argument to change window size because parameters are specific for a
        window=9. The parameters used are optimized for determining the flexibility.

        Returns a list with one score per window position visited.
        """
        flexibilities = ProtParamData.Flex
        window_size = 9
        # Only the first four weights are indexed by the loop below; the
        # final 1 is the weight implicitly given to the middle residue.
        weights = [0.25, 0.4375, 0.625, 0.8125, 1]
        scores = []

        # NOTE(review): two details here look like off-by-one slips. They are
        # left unchanged because fixing them alters the numeric output --
        # confirm against Vihinen et al. 1994 before touching:
        #   * range(self.length - window_size) never visits the last full
        #     window (protein_scale uses "+ 1" in the equivalent loop);
        #   * the centre of a 9-residue window is index 4, yet index 5 is
        #     used as "middle", so index 5 is scored twice and index 4 never.
        for i in range(self.length - window_size):
            subsequence = self.sequence[i:i + window_size]
            score = 0.0

            for j in range(window_size // 2):
                front = subsequence[j]
                back = subsequence[window_size - j - 1]
                score += (flexibilities[front] +
                          flexibilities[back]) * weights[j]

            middle = subsequence[window_size // 2 + 1]
            score += flexibilities[middle]

            # 5.25 is the sum of all nine positional weights.
            scores.append(score / 5.25)

        return scores

    def gravy(self):
        """Calculate the gravy according to Kyte and Doolittle.

        Raises ZeroDivisionError for an empty sequence.
        """
        total_gravy = sum(ProtParamData.kd[aa] for aa in self.sequence)

        return total_gravy / self.length

    def _weight_list(self, window, edge):
        """Makes a list of relative weight of the
        window edges compared to the window center. The weights are linear.
        it actually generates half a list. For a window of size 9 and edge 0.4
        you get a list of [0.4, 0.55, 0.7, 0.85].

        A window of 0 or 1 has no flanking positions and yields an empty list.
        """
        half = window // 2
        if half == 0:
            # Nothing to weight; returning early also avoids dividing by
            # (window - 1) == 0 when window is 1.
            return []

        unit = 2 * (1.0 - edge) / (window - 1)
        return [edge + unit * i for i in range(half)]

    def protein_scale(self, param_dict, window, edge=1.0):
        """Compute a profile by any amino acid scale.

        An amino acid scale is defined by a numerical value assigned to each type of
        amino acid. The most frequently used scales are the hydrophobicity or
        hydrophilicity scales and the secondary structure conformational parameters
        scales, but many other scales exist which are based on different chemical and
        physical properties of the amino acids.  You can set several parameters that
        control the computation  of a scale profile, such as the window size and the
        window edge relative weight value.

        WindowSize: The window size is the length
        of the interval to use for the profile computation. For a window size n, we
        use the i-(n-1)/2 neighboring residues on each side to compute
        the score for residue i. The score for residue i is the sum of the scaled values
        for these amino acids, optionally weighted according to their position in the
        window.

        Edge: The central amino acid of the window always has a weight of 1.
        By default, the amino acids at the remaining window positions have the same
        weight, but you can make the residue at the center of the window  have a
        larger weight than the others by setting the edge value for the  residues at
        the beginning and end of the interval to a value between 0 and 1. For
        instance, for Edge=0.4 and a window size of 5 the weights will be: 0.4, 0.7,
        1.0, 0.7, 0.4.

        The method returns a list of values which can be plotted to
        view the change along a protein sequence.  Many scales exist. Just add your
        favorites to the ProtParamData modules.

        Residues missing from param_dict contribute nothing to the score and
        produce a warning on stderr.

        Similar to expasy's ProtScale: http://www.expasy.org/cgi-bin/protscale.pl
        """
        # generate the weights
        #   _weight_list returns only one tail. If the list should be [0.4,0.7,1.0,0.7,0.4]
        #   what you actually get from _weights_list is [0.4,0.7]. The correct calculation is done
        #   in the loop.
        weights = self._weight_list(window, edge)
        scores = []

        # the score in each Window is divided by the sum of weights
        # (* 2 + 1) since the weight list is one sided:
        sum_of_weights = sum(weights) * 2 + 1

        for i in range(self.length - window + 1):
            subsequence = self.sequence[i:i + window]
            score = 0.0

            for j in range(window // 2):
                # walk from the outside of the Window towards the middle.
                # Iddo: try/except clauses added to avoid raising an exception on a non-standard amino acid
                try:
                    front = param_dict[subsequence[j]]
                    back = param_dict[subsequence[window - j - 1]]
                    score += weights[j] * front + weights[j] * back
                except KeyError:
                    # If either lookup fails, neither residue of the pair
                    # is added to the score.
                    sys.stderr.write(
                        'warning: %s or %s is not a standard amino acid.\n' %
                        (subsequence[j], subsequence[window - j - 1]))

            # Now add the middle value, which always has a weight of 1.
            middle = subsequence[window // 2]
            if middle in param_dict:
                score += param_dict[middle]
            else:
                sys.stderr.write(
                    'warning: %s  is not a standard amino acid.\n' % (middle))

            scores.append(score / sum_of_weights)

        return scores

    def isoelectric_point(self):
        """Calculate the isoelectric point.

        Uses the module IsoelectricPoint to calculate the pI of a protein.
        """
        aa_content = self.count_amino_acids()

        ie_point = IsoelectricPoint.IsoelectricPoint(self.sequence, aa_content)
        return ie_point.pi()

    def secondary_structure_fraction(self):
        """Calculate fraction of helix, turn and sheet.

        Returns the fraction of amino acids which tend
        to be in Helix, Turn or Sheet.

        Amino acids in helix: V, I, Y, F, W, L.
        Amino acids in Turn: N, P, G, S.
        Amino acids in sheet: E, M, A, L.

        Returns a tuple of three floats (Helix, Turn, Sheet). L is counted
        in both the helix and the sheet fraction.
        """
        aa_percentages = self.get_amino_acids_percent()

        helix = sum([aa_percentages[r] for r in 'VIYFWL'])
        turn = sum([aa_percentages[r] for r in 'NPGS'])
        sheet = sum([aa_percentages[r] for r in 'EMAL'])

        return helix, turn, sheet