Beispiel #1
0
#!/usr/bin/env python3

import argparse
import biotools
import sys

# Write a program that computes the amino acid composition of a protein file
# Use a dictionary
# Tally every residue seen across all proteins in the input file, then report
# each residue next to its share of the overall residue total.
count = {}
tot_count = 0
for _, protein in biotools.read_fasta(sys.argv[1]):
    for residue in protein:
        count[residue] = count.get(residue, 0) + 1
        tot_count += 1

for residue, seen in count.items():
    print(residue, seen / tot_count)
"""
python3 composition.py proteins.fasta.gz | sort -nk 2  (numerically by column 2)
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
H 0.027689991912051043
Y 0.02980558967138963
F 0.036174849474283316
N 0.04593281011293173
I 0.049422610310637154
D 0.052167270766557826
Q 0.05259413473923853
Beispiel #2
0
                    metavar='<str>',
                    help='FASTA file')
parser.add_argument('--pseudo',
                    required=False,
                    type=float,
                    default=1.0,
                    metavar='<float>',
                    help='pseudocount [%(default)f]')
arg = parser.parse_args()

dd = {}  # Dictionary is 2D and contains codon counts for each locus
genome = {
}  # Dictionary is 1D and contains total codon counts for whole genome
sizes = {}  # Number of codons in sequence at each locus
total = 0  # Total counts of all codons in the genome
for name, seq in biotools.read_fasta(arg.file):
    match = re.search('locus_tag=(\w+)', name)  # Find locus tag
    locus = match[1]
    sizes[locus] = 0
    for i in range(0, len(seq) - 2, 3):
        codon = seq[i:i + 3]
        if locus not in dd:  # Adds locus name to dictionary
            dd[locus] = {}
        if codon not in dd[
                locus]:  # First time a codon is seen at a locus, need to add pseudocount to both codon count and size count
            dd[locus][
                codon] = arg.pseudo + 1  # Adds pseudocount plus first codon count
            sizes[
                locus] += arg.pseudo + 1  # Increases size length by pseudocount plus first codon count
        else:  # Every other time codon is seen at a locus, increase codon count and size count by one
            dd[locus][codon] += 1
        dict = OIStable
    elif scale == 'cc':
        dict = CCtable
    else:
        print(f'unsupported scale {scale}')
        return None

    for i in range(len):
        val = dict.get(seq[start + i])
        if val is None:
            print("Bad data: ", seq[start:start + len])
            return None  # Bad data...
        hd += val
    return hd / len


# For each sequence: print a FASTA-style header, then the start index and score
# of every window computeHD could score, and finally how many of those scores
# were below 1.
for name, seq in bt.read_fasta(arg.input):
    print(f'>{name}')
    num_below = 0
    for start in range(len(seq) - arg.window):
        score = computeHD(seq, start, arg.window, arg.method)
        if score is not None:  # computeHD returns None when it cannot score a window
            print(start, score)
            if score < 1:
                num_below += 1
    print(num_below)
"""
python3 hydrophobicity.py --input proteins.fasta.gz --window 11 --method kd
"""
        elif codon == 'TAT': pro.append('Y')
        elif codon == 'TCA': pro.append('S')
        elif codon == 'TCC': pro.append('S')
        elif codon == 'TCT': pro.append('S')
        elif codon == 'TCT': pro.append('S')
        elif codon == 'TGA': pro.append('*')
        elif codon == 'TGC': pro.append('C')
        elif codon == 'TGG': pro.append('W')
        elif codon == 'TGT': pro.append('C')
        elif codon == 'TTA': pro.append('L')
        elif codon == 'TTC': pro.append('F')
        elif codon == 'TTG': pro.append('L')
        elif codon == 'TTT': pro.append('F')
        else: pro.append('X')
    return ''.join(pro)


# Print a FASTA record (header line, then the longest_orf result) for every
# sequence where longest_orf returns something other than None.
for name, seq in bt.read_fasta('mRNA.fa.gz'):
    pro = longest_orf(seq)
    if pro is None:
        continue
    print(f'>{name}')
    print(pro)
"""
python3 translate_mRNA.py --file ../Lesson05/transcripts.fasta.gz
>CBG00001.1
MTFCENKNLPKPPSDRCQVVVISILSMILDFYLKYNPDKHWAHLFYGASPILEILVIFGMLANSVYGNKLAMFACVLDLVSGVFCLLTLPVISVAENATGVRLHLPYISTFHSQFSFQVSTPVDLFYVATFLGFVSTILILLFLILDALKFMKLRKLRNEDLEKEKKMNPIEKV*
>CBG00006.1
MNGVEKVNKYFDIKDKRDFLYHFGFGVDTLDIKAVFGDTKFVCTGGSPGRFKLYAEWFAKETSIPCSENLSRSDRFVIYKTGPVCWINHGMGTPSLSIMLVESFKLMHHAGVKNPTFIRLGTSGGVGVPPGTVVVSTGAMNAELGDTYVQVIAGKRIERPTQLDATLREALCAVGKEKNIPVETGKTMCADDFYEGQMRLDGYFCDYEEEDKYAFLRKLNSLGVRNIEMESTCFASFTCRAGFPSAIVCVTLLNRMDGDQVQIDKEKYIEYEERPFRLVTAYIRQQTGV*
etc.
"""
                    required=False,
                    type=float,
                    default=2.5,
                    metavar='<float>',
                    help='kd value for signal peptide [%(default)f]')
parser.add_argument('--kd2',
                    required=False,
                    type=float,
                    default=2.0,
                    metavar='<float>',
                    help='kd value for hydrophobic region [%(default)f]')

# finalization
arg = parser.parse_args()

for name, seq in bt.read_fasta(arg.file):
    cond1 = False
    cond2 = False
    # test for signal condition KD > 2.5
    for i in range(0, (30 - arg.win1) + 1):
        assert (i < 23)
        if bt.computeKD(seq, i, arg.win1) > arg.kd1:
            cond1 = True
            break
    if not cond1: continue

    # test for hydrophobic condition KD > 2.0 and no peptide
    for i in range(30, (len(seq) - arg.win2)):  # leave out trailing *
        if bt.computeKD(seq, i, arg.win2) > arg.kd2:
            if 'P' in seq[i:i + arg.win2]:
                continue
             'S': 0.13,'T':0.14,'W':-1.85,'Y':-0.94,'V': 0.07 }

kd = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5,
       'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5,
       'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6,
       'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 }
	   
def hyd(pro, method):
    """Return the summed scale value of the residues in ``pro``.

    ``method`` selects the lookup table: 0 uses ``is_scale`` and 1 uses
    ``kd``. Residues that are not keys of the chosen table contribute
    nothing to the sum.

    Raises:
        ValueError: if ``method`` is neither 0 nor 1.
    """
    if method == 0:
        scale = is_scale
    elif method == 1:
        scale = kd
    else:
        # Fail explicitly: without this, scale stayed None and the lookup
        # below died with an opaque TypeError (or an empty ``pro`` silently
        # returned 0).
        raise ValueError('unsupported method %r; expected 0 or 1' % (method,))
    return sum(scale[aa] for aa in pro if aa in scale)

# Print the start index and hyd() score of every window of arg.window residues
# in each upper-cased sequence, after dropping one trailing '*' if present.
for name, pro in biotools.read_fasta(arg.file):
    pro = pro.upper()
    # endswith() is safe on an empty sequence, where pro[-1] would raise
    # IndexError.
    if pro.endswith('*'):
        pro = pro[:-1]
    for i in range(len(pro) - arg.window + 1):
        win = pro[i:i + arg.window]
        print(i, hyd(win, arg.method))



"""
python3 hydrophobicity.py --input proteins.fasta.gz --window 11 --method kd
"""
#!/usr/bin/env python3

import argparse
import biotools as bt
import sys

# Write a program that computes the amino acid composition of a protein file
# Use a dictionary


# Count every residue over the whole file, then print each residue and its
# fraction of the total (tab-separated, five decimal places).
count = {}
total = 0
for _, pro in bt.read_fasta(sys.argv[1]):
    for aa in pro:
        count[aa] = count.get(aa, 0) + 1
        total += 1

for aa in count.keys():
    print(f'{aa}\t{count[aa]/total:.5f}')

# | sort        sort by first column alphabetically
# | sort -k2    sort by the second column alphabetically (even though they are numbers)
# | sort -nk2   sort by the second column numerically

"""
python3 composition.py proteins.fasta.gz | sort -nk2
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
Beispiel #8
0
parser.add_argument('--threshold', required=False, type=float, default=1.1,
	metavar='<float>', help='entropy threshold [%(default)f]')

# switches
parser.add_argument('--lowercase', action='store_true',
	help='report lowercase instead of N')
# finalization
arg = parser.parse_args()

def entropy(data):
    """Return the Shannon entropy, in bits, of the fractions in ``data``.

    ``data`` is a sequence of fractions (e.g. one per nucleotide). A zero
    fraction is skipped: it contributes nothing to the entropy, whereas
    ``math.log2(0)`` would raise ValueError.

    Raises:
        ValueError: if a fraction is negative (from ``math.log2``).
    """
    h = 0
    for fraction in data:
        if fraction == 0:
            continue  # 0 * log2(0) is taken as 0
        h -= fraction * math.log2(fraction)
    return h

for name, seq in biotools.read_fasta(arg.input):
    filtered_seq = list(seq)
    for i in range(0, len(seq) -arg.win+1):
        sseq = seq[i : i+arg.win]
        a, t, g, c = 0.0, 0.0, 0.0, 0.0
        for nt in sseq:
            if   nt == 'A': a += 1.0
            elif nt == 'T': t += 1.0
            elif nt == 'G': g += 1.0
            elif nt == 'C': c += 1.0
            
        a_frac = a/arg.win
        t_frac = t/arg.win
        g_frac = g/arg.win
        c_frac = c/arg.win
            
def pwm_prob(sequence, thre):
    """Score every window of ``sequence`` against the global ``pos_weight``.

    Each window is as long as ``pos_weight``. A window's probability is the
    product, over its positions, of the weight stored for the observed
    character at that position divided by the summed weights of the first
    ``pos_weight`` entry.

    Returns a dict mapping window start index to probability, containing
    only the windows whose probability is greater than ``thre``.
    """
    # Normaliser: the sum of all weights in the first entry.
    column_sum = 0
    for weight in pos_weight[0].values():
        column_sum += float(weight)

    width = len(pos_weight)
    hits = {}
    for start in range(len(sequence) - width + 1):
        prob = 1
        for offset, symbol in enumerate(sequence[start:start + width]):
            prob *= float(pos_weight[offset][str(symbol)]) / column_sum
        if prob > thre:
            hits[start] = prob
    return hits


# Load the weight matrix that pwm_prob reads as a global.
pos_weight = read_transfac(arg.pwm)

# One tab-separated line per hit: first word of the sequence name, window
# start, the matched subsequence, and the probability to four decimals.
for name, seq in biotools.read_fasta(arg.dna):
    name = name.split(' ')
    hits = pwm_prob(seq, arg.threshold)
    for position, probability in hits.items():
        print(
            f'{name[0]}\t{position}\t{seq[position:position+len(pos_weight)]}\t{probability:.4f}'
        )
"""
python3 pwm_search.py --dna sars-cov-2.fa.gz --pwm MA0036.1.transfac --threshold 0.01
"""
Beispiel #10
0
    type=int,
    default=100,
    metavar='<int>',
    help='Minimum amino acid length for reporting [%(default)i]')

# Switch
parser.add_argument(
    '--genreport',
    action="store_true",
    help='Whether or not the user wants to see a genome report')

# Finalization
arg = parser.parse_args()

for name, seq in bt.read_fasta(
        arg.file
):  # Program reads FASTA file of genome sequence (was for name, seq in bt.read_fasta(seq):)
    gen_size = len(
        seq
    )  # Calculate the length of the genome size (reported if genreport is switched on)
    gen_name = 0  # Set-up to give genes unique names
    gen_num = 0  # Reports gene number if genreport is switched on
    cds = 0  # Calculates the number of nucleotides that are part of an ORF (coding sequence)
    pos_strand = 0  # Calculates the number of genes on the positive strand
    neg_strand = 0  # Calculates the number of genes on the negative strand
    f_and_r = [
    ]  # Making a list to store F and R sequences (positive and negative strand)
    comp_seq = bt.anti(
        seq)  # Use anti function to create reverse complement and store it
    f_and_r.append(seq)  # Add the forward sequence to the list
    f_and_r.append(comp_seq)  # Add the reverse sequence to the list
# Use a dictionary

# Setup
parser = argparse.ArgumentParser(
    description='Shows amino acid composition of a list of protein files')

# Required Arguments
parser.add_argument(
    '--file',
    type=str,
    required=True,
    help='Protein File',
    metavar='<path>')

# Finalization
arg = parser.parse_args()

# Tally each residue over every protein in the file; the total is the summed
# length of all proteins.
count = {}
total = 0
for _, protein in bt.read_fasta(arg.file):
    total += len(protein)
    for aa in protein:
        count[aa] = count.get(aa, 0) + 1

# Report each residue with its fraction of the total.
for aa, seen in count.items():
    print(aa, seen / total)

"""
python3 composition.py --file ../Week\ 5/proteins.fasta.gz | sort -nk2
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
H 0.027689991912051043
seq2 = "GCGAGTTCATCTATCACGACCGCGGTCG"

# Format taken from:
# http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Explanation_BLAST_output.html#fig:ncbiblasttable
#  Score = 22
#  Identities = 15/23 (65.2%), Gaps = 6/23 (26.1%)
#
# Query  1     CTATCACCTGACCTCCAGG-CCG  23
#              :||||||  ||||:|  || | |
# Sbjct  11    ATATCAC--GACCGC--GGTC-G  33

#print("seq1: ",seq1)
#print("seq2: ",seq2)

seq1 = None
for name2, seq2 in biotools.read_fasta(arg.file):
    if seq1 == None:
        seq1 = seq2
        name1 = name2
        print('Query: ', name1)
        continue
    # Allow space for gap in the scoring matrix (e.g. dimension+1)
    rows = len(seq1) + 1
    cols = len(seq2) + 1

    # Initialize the scoring matrix.
    score_matrix, start_pos = create_score_matrix(rows, cols)

    # Find the optimal path through the scoring matrix.
    # This gives the optimal local alignment
    pos1, aligned1, pos2, aligned2 = traceback(score_matrix, start_pos)
Beispiel #13
0
#!/usr/bin/env python3

import gzip
import sys

# We have imported modules like math, sys, and gzip a few times
# You can also write and import your own modules

import biotools as bt

# The read_fasta() and gc() functions are in biotools.py (take a look)

# For every record in the genome file, print its name, its sequence length,
# and whatever bt.gc() returns for the sequence.
for header, sequence in bt.read_fasta('genome.fa.gz'):
    length = len(sequence)
    gc_value = bt.gc(sequence)
    print(header, length, gc_value)