#!/usr/bin/env python3

import argparse
import biotools
import sys

# Write a program that computes the amino acid composition of a protein file
# Use a dictionary

# Tally every residue letter over all records of the FASTA file named by
# the first command-line argument.
count = {}
tot_count = 0
for _name, protein in biotools.read_fasta(sys.argv[1]):
    for residue in protein:
        count[residue] = count.get(residue, 0) + 1
        tot_count += 1

# Print each letter followed by its share of all letters seen.
for residue, seen in count.items():
    print(residue, seen / tot_count)

"""
python3 composition.py proteins.fasta.gz | sort -nk 2 (numerically by column 2)
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
H 0.027689991912051043
Y 0.02980558967138963
F 0.036174849474283316
N 0.04593281011293173
I 0.049422610310637154
D 0.052167270766557826
Q 0.05259413473923853
"""
metavar='<str>', help='FASTA file')
parser.add_argument('--pseudo', required=False, type=float, default=1.0,
    metavar='<float>', help='pseudocount [%(default)f]')
arg = parser.parse_args()

# Accumulators for codon counts. In the code visible here only dd and sizes
# are written; genome and total are initialised but not yet used.
dd = {}  # Dictionary is 2D and contains codon counts for each locus
genome = { }  # Dictionary is 1D and contains total codon counts for whole genome
sizes = {}  # Number of codons in sequence at each locus
total = 0  # Total counts of all codons in the genome

for name, seq in biotools.read_fasta(arg.file):
    match = re.search('locus_tag=(\w+)', name)  # Find locus tag
    # NOTE(review): re.search() returns None when the header has no
    # "locus_tag=..." field, in which case match[1] raises TypeError --
    # confirm every header carries the tag.
    locus = match[1]
    sizes[locus] = 0
    # Walk the sequence three letters at a time; the range bound keeps a
    # trailing one- or two-letter remainder from being read as a codon.
    for i in range(0, len(seq) - 2, 3):
        codon = seq[i:i + 3]
        if locus not in dd:  # Adds locus name to dictionary
            dd[locus] = {}
        if codon not in dd[locus]:  # First time a codon is seen at a locus, need to add pseudocount to both codon count and size count
            dd[locus][codon] = arg.pseudo + 1  # Adds pseudocount plus first codon count
            sizes[locus] += arg.pseudo + 1  # Increases size length by pseudocount plus first codon count
        else:  # Every other time codon is seen at a locus, increase codon count and size count by one
            dd[locus][codon] += 1
dict = OIStable elif scale == 'cc': dict = CCtable else: print(f'unsupported scale {scale}') return None for i in range(len): val = dict.get(seq[start + i]) if val is None: print("Bad data: ", seq[start:start + len]) return None # Bad data... hd += val return hd / len for name, seq in bt.read_fasta(arg.input): print(f'>{name}') num_below = 0 for i in range(0, len(seq) - arg.window): hd = computeHD(seq, i, arg.window, arg.method) if hd == None: continue print(i, hd) if hd < 1: num_below += 1 print(num_below) """ python3 hydrophobicity.py --input proteins.fasta.gz --window 11 --method kd """
elif codon == 'TAT': pro.append('Y') elif codon == 'TCA': pro.append('S') elif codon == 'TCC': pro.append('S') elif codon == 'TCT': pro.append('S') elif codon == 'TCT': pro.append('S') elif codon == 'TGA': pro.append('*') elif codon == 'TGC': pro.append('C') elif codon == 'TGG': pro.append('W') elif codon == 'TGT': pro.append('C') elif codon == 'TTA': pro.append('L') elif codon == 'TTC': pro.append('F') elif codon == 'TTG': pro.append('L') elif codon == 'TTT': pro.append('F') else: pro.append('X') return ''.join(pro) for name, seq in bt.read_fasta('mRNA.fa.gz'): pro = longest_orf(seq) if pro != None: print(f'>{name}') print(pro) """ python3 translate_mRNA.py --file ../Lesson05/transcripts.fasta.gz >CBG00001.1 MTFCENKNLPKPPSDRCQVVVISILSMILDFYLKYNPDKHWAHLFYGASPILEILVIFGMLANSVYGNKLAMFACVLDLVSGVFCLLTLPVISVAENATGVRLHLPYISTFHSQFSFQVSTPVDLFYVATFLGFVSTILILLFLILDALKFMKLRKLRNEDLEKEKKMNPIEKV* >CBG00006.1 MNGVEKVNKYFDIKDKRDFLYHFGFGVDTLDIKAVFGDTKFVCTGGSPGRFKLYAEWFAKETSIPCSENLSRSDRFVIYKTGPVCWINHGMGTPSLSIMLVESFKLMHHAGVKNPTFIRLGTSGGVGVPPGTVVVSTGAMNAELGDTYVQVIAGKRIERPTQLDATLREALCAVGKEKNIPVETGKTMCADDFYEGQMRLDGYFCDYEEEDKYAFLRKLNSLGVRNIEMESTCFASFTCRAGFPSAIVCVTLLNRMDGDQVQIDKEKYIEYEERPFRLVTAYIRQQTGV* etc. """
required=False, type=float, default=2.5, metavar='<float>',
help='kd value for signal peptide [%(default)f]')
parser.add_argument('--kd2', required=False, type=float, default=2.0,
    metavar='<float>', help='kd value for hydrophobic region [%(default)f]')

# finalization
arg = parser.parse_args()

# Screen each record in two stages: a window scoring above arg.kd1 must start
# within the first 30 positions, then a later window is tested against
# arg.kd2.
for name, seq in bt.read_fasta(arg.file):
    cond1 = False
    cond2 = False

    # test for signal condition: some window of length arg.win1 lying fully
    # inside positions 0-29 has KD > arg.kd1
    for i in range(0, (30 - arg.win1) + 1):
        # NOTE(review): this holds only while arg.win1 >= 8; a smaller
        # --win1 trips the assertion -- confirm that is intended.
        assert (i < 23)
        if bt.computeKD(seq, i, arg.win1) > arg.kd1:
            cond1 = True
            break
    if not cond1:
        continue

    # test for hydrophobic condition: a window of length arg.win2 starting at
    # position 30 or later with KD > arg.kd2 and no 'P' in it
    for i in range(30, (len(seq) - arg.win2)):  # leave out trailing *
        if bt.computeKD(seq, i, arg.win2) > arg.kd2:
            if 'P' in seq[i:i + arg.win2]:
                continue
'S': 0.13,'T':0.14,'W':-1.85,'Y':-0.94,'V': 0.07 } kd = { 'A': 1.8,'R':-4.5,'N':-3.5,'D':-3.5,'C': 2.5, 'Q':-3.5,'E':-3.5,'G':-0.4,'H':-3.2,'I': 4.5, 'L': 3.8,'K':-3.9,'M': 1.9,'F': 2.8,'P':-1.6, 'S':-0.8,'T':-0.7,'W':-0.9,'Y':-1.3,'V': 4.2 } def hyd(pro, method): scale = None if method == 0: scale = is_scale elif method == 1: scale = kd #else: make it fail h = 0 for aa in pro: if aa in scale: h += scale[aa] return h for name, pro in biotools.read_fasta(arg.file): pro = pro.upper() if pro[-1] == '*': pro = pro[0:-1] for i in range(0, len(pro) - arg.window + 1): win = pro[i:i+arg.window] print(i, hyd(win, arg.method)) """ python3 hydrophobicity.py --input proteins.fasta.gz --window 11 --method kd """
#!/usr/bin/env python3

import argparse
import biotools as bt
import sys

# Write a program that computes the amino acid composition of a protein file
# Use a dictionary

count = {}  # residue letter -> number of times it was seen
total = 0   # number of letters seen over all records
for _defline, pro in bt.read_fasta(sys.argv[1]):
    for aa in pro:
        if aa not in count:
            count[aa] = 0
        count[aa] += 1
        total += 1

# One tab-separated line per letter: the letter, then its fraction of all
# letters formatted to five decimals.
for aa, hits in count.items():
    print(f'{aa}\t{hits/total:.5f}')

# | sort        sort by first column alphabetically
# | sort -k2    sort by the second column alphabetically (even though they are numbers)
# | sort -nk2   sort by the second column numerically

"""
python3 composition.py proteins.fasta.gz | sort -nk2
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
"""
parser.add_argument('--threshold', required=False, type=float, default=1.1,
    metavar='<float>', help='entropy threshold [%(default)f]')
# switches
parser.add_argument('--lowercase', action='store_true',
    help='report lowercase instead of N')
# finalization
arg = parser.parse_args()


def entropy(data):
    """Return the Shannon entropy, in bits, of the fractions in *data*.

    *data* is a sequence of fractions (one per nucleotide).  A fraction of
    exactly 0 contributes nothing: math.log2(0) raises ValueError, and the
    term 0 * log2(0) is conventionally taken as 0, so a window that lacks
    one of the letters no longer aborts the run.  An empty *data* gives 0.
    """
    h = 0
    for frac in data:
        if frac == 0:
            continue
        h -= frac * math.log2(frac)
    return h


# Slide a window of arg.win letters over every record and work out what
# fraction of the window each of A, T, G and C makes up.  Letters other than
# those four are not counted, but the window length is still the divisor.
for name, seq in biotools.read_fasta(arg.input):
    filtered_seq = list(seq)
    for i in range(0, len(seq) -arg.win+1):
        sseq = seq[i : i+arg.win]
        a, t, g, c = 0.0, 0.0, 0.0, 0.0
        for nt in sseq:
            if nt == 'A':
                a += 1.0
            elif nt == 'T':
                t += 1.0
            elif nt == 'G':
                g += 1.0
            elif nt == 'C':
                c += 1.0
        a_frac = a/arg.win
        t_frac = t/arg.win
        g_frac = g/arg.win
        c_frac = c/arg.win
def pwm_prob(sequence, thre):
    """Score every window of *sequence* against the module-level ``pos_weight``.

    A window is as long as ``pos_weight``.  Its score is the product, over the
    window positions, of that position's weight for the observed letter
    divided by the sum of all weights stored at position 0.

    Returns a dict mapping window start offset to score, containing only the
    windows whose score is strictly greater than *thre*.
    """
    # Normalising constant: the sum of the weights at the first position.
    total = 0
    for weight in pos_weight[0].values():
        total += float(weight)

    width = len(pos_weight)
    hits = {}
    for start in range(len(sequence) - width + 1):
        window = sequence[start:start + width]
        score = 1
        for offset, letter in enumerate(window):
            score *= float(pos_weight[offset][str(letter)]) / total
        if score > thre:
            hits[start] = score
    return hits


pos_weight = read_transfac(arg.pwm)

# One tab-separated line per reported window: first word of the record name,
# start offset, the matched letters, and the score to four decimals.
for name, seq in biotools.read_fasta(arg.dna):
    name = name.split(' ')
    for position, probability in pwm_prob(seq, arg.threshold).items():
        site = seq[position:position+len(pos_weight)]
        print(f'{name[0]}\t{position}\t{site}\t{probability:.4f}')

"""
python3 pwm_search.py --dna sars-cov-2.fa.gz --pwm MA0036.1.transfac --threshold 0.01
"""
type=int, default=100, metavar='<int>',
help='Minimum amino acid length for reporting [%(default)i]')

# Switch
parser.add_argument(
    '--genreport', action="store_true",
    help='Whether or not the user wants to see a genome report')

# Finalization
arg = parser.parse_args()

# Per-record setup: reset the counters and collect the sequence together with
# the result of bt.anti() so that both can be handled by the same code.
for name, seq in bt.read_fasta(arg.file):  # Program reads FASTA file of genome sequence (was for name, seq in bt.read_fasta(seq):)
    gen_size = len(seq)  # Calculate the length of the genome size (reported if genreport is switched on)
    gen_name = 0  # Set-up to give genes unique names
    gen_num = 0  # Reports gene number if genreport is switched on
    cds = 0  # Calculates the number of nucleotides that are part of an ORF (coding sequence)
    pos_strand = 0  # Calculates the number of genes on the positive strand
    neg_strand = 0  # Calculates the number of genes on the negative strand
    f_and_r = []  # Making a list to store F and R sequences (positive and negative strand)
    comp_seq = bt.anti(seq)  # Use anti function to create reverse complement and store it
    f_and_r.append(seq)  # Add the forward sequence to the list
    f_and_r.append(comp_seq)  # Add the reverse sequence to the list
# Use a dictionary

# Setup
parser = argparse.ArgumentParser(
    description='Shows amino acid composition of a list of protein files')

# Required Arguments
parser.add_argument(
    '--file',
    required=True,
    type=str,
    metavar='<path>',
    help='Protein File',
)

# Finalization
arg = parser.parse_args()

# Count every residue letter, and the overall number of letters, across all
# records in the input file.
count = {}
total = 0
for _header, protein in bt.read_fasta(arg.file):
    total += len(protein)
    for residue in protein:
        count[residue] = 1 + count.get(residue, 0)

# Print each letter with its fraction of all letters.
for residue, tally in count.items():
    print(residue, tally / total)

"""
python3 composition.py --file ../Week\ 5/proteins.fasta.gz | sort -nk2
* 0.0017561333612916754
W 0.010255968606775905
C 0.019017913309169337
M 0.023765838900038944
H 0.027689991912051043
"""
seq2 = "GCGAGTTCATCTATCACGACCGCGGTCG" # Format taken from: # http://resources.qiagenbioinformatics.com/manuals/clcgenomicsworkbench/650/Explanation_BLAST_output.html#fig:ncbiblasttable # Score = 22 # Identities = 15/23 (65.2%), Gaps = 6/23 (26.1%) # # Query 1 CTATCACCTGACCTCCAGG-CCG 23 # :|||||| ||||:| || | | # Sbjct 11 ATATCAC--GACCGC--GGTC-G 33 #print("seq1: ",seq1) #print("seq2: ",seq2) seq1 = None for name2, seq2 in biotools.read_fasta(arg.file): if seq1 == None: seq1 = seq2 name1 = name2 print('Query: ', name1) continue # Allow space for gap in the scoring matrix (e.g. dimension+1) rows = len(seq1) + 1 cols = len(seq2) + 1 # Initialize the scoring matrix. score_matrix, start_pos = create_score_matrix(rows, cols) # Find the optimal path through the scoring matrix. # This gives the optimal local alighnment pos1, aligned1, pos2, aligned2 = traceback(score_matrix, start_pos)
#!/usr/bin/env python3

import gzip
import sys

# We have imported modules like math, sys, and gzip a few times
# You can also write and import your own modules

import biotools as bt

# The read_fasta() and gc() functions are in biotools.py (take a look)

# For every record in genome.fa.gz print its name, its length and the value
# returned by bt.gc() for its sequence.
for record in bt.read_fasta('genome.fa.gz'):
    name, seq = record
    print(name, len(seq), bt.gc(seq))