def kmer_map(input_file, k, get_seq=True):
    """Index every k-mer of a FASTA input by where it occurs.

    Args:
        input_file: FASTA input, handed unchanged to ``fasta.FASTAReader``.
        k: k-mer length.
        get_seq: When True, each occurrence is recorded as an
            ``(ident, position)`` tuple. When False, only the 0-based
            position is recorded (enough when the input holds a single
            sequence).

    Returns:
        A dict mapping each upper-cased k-mer to the list of its
        occurrences, in the order they were encountered.
    """
    kmer_dict = {}
    for ident, sequence in fasta.FASTAReader(input_file):
        sequence = sequence.upper()
        # A sequence of length n has n - k + 1 windows of width k; the
        # "+ 1" keeps the final window, which the old bound skipped.
        for i in range(len(sequence) - k + 1):
            kmer = sequence[i:i + k]
            item = (ident, i) if get_seq else i
            kmer_dict.setdefault(kmer, []).append(item)
    return kmer_dict
#!/usr/bin/env python
"""Thread nucleotide sequences onto a gapped protein alignment.

usage: <this script> <nucleotides.fa> <aligned_proteins.fa>

Records of the two FASTA files are paired in order. For every pair one line
is written to alignment_nuc1.fa: "---" for each "-" in the second file's
sequence, and the next unused codon of the first file's sequence for every
other character.
"""
import sys
import fasta
import itertools

try:
    from itertools import izip  # Python 2: lazy pairwise iteration
except ImportError:
    izip = zip  # Python 3: the built-in zip is already lazy

with open(sys.argv[1]) as d_file, open(sys.argv[2]) as a_file, \
        open("alignment_nuc1.fa", "w") as align_file:
    record_pairs = izip(fasta.FASTAReader(d_file), fasta.FASTAReader(a_file))
    for (d_ident, d_seq), (a_ident, a_seq) in record_pairs:
        position = 0  # offset of the next unused codon in d_seq
        codons = []
        for a in a_seq:
            if a == "-":
                codons.append("---")
            else:
                codons.append(d_seq[position:position + 3])
                # Move on by a whole codon; advancing by 1 produced
                # overlapping triplets instead of consecutive codons.
                position += 3
        align_file.write("".join(codons) + "\n")

# Kept from the original: echo the (now closed) output file object.
print(align_file)
def outliers_z_score(ys): # threshold = 3 threshold = 0.64 # mean_y = np.mean(ys) mean_y = 0 stdev_y = np.std(ys) z_scores = [(y - mean_y) / stdev_y for y in ys] return np.where(np.abs(z_scores) > threshold) nu_file = open(sys.argv[1]) aa_file = open(sys.argv[2]) out_filename = sys.argv[3] nu_reader = fasta.FASTAReader(nu_file) aa_reader = fasta.FASTAReader(aa_file) # mut = [[codon, aa, dn, ds], [codon, aa, dn, ds], ...] index = 0 mut = [] for (nident, nseq), (aident, aseq) in it.izip(nu_reader, aa_reader): nid = 0 # print aseq for aid in range(len(aseq)): aa = aseq[aid] codon = nseq[nid:nid + 3] if index == 0:
#!/usr/bin/env python
"""Usage: $ ./alignment.py <alignment_prot.fa> <1000_homologues.fa> <aminout.out>

For each pair of records, writes the nucleotide record's name on one line and,
on the next line, the nucleotide sequence laid out along the protein
alignment: "---" for every "-" in the protein sequence, the next codon of the
nucleotide sequence for every other residue.
"""
# Most of this code was contributed by Tabea. I figured out my own order of
# opening files and tab separating the amino acid ids from their nucleotide
# sequences and then returning the lines after. I tested my tab separation
# within the lines using the commented out part at the bottom and command
# line entry:
#   $ ./alignment.py <alignment_prot.fa> <1000_homologues.fa> <aminout.out> <aminout.out> | less -S
# Then I realized that I did want the ids on different lines than the
# sequences for a fasta format and changed it back.
import sys
import itertools
import fasta

try:
    from itertools import izip  # Python 2: lazy pairwise iteration
except ImportError:
    izip = zip  # Python 3: the built-in zip is already lazy

# The with-block guarantees the output is flushed and closed even on error.
with open(sys.argv[1]) as amino_file, open(sys.argv[2]) as nuc_file, \
        open(sys.argv[3], 'w') as aminout:
    aminos = fasta.FASTAReader(amino_file)
    nucleotides = fasta.FASTAReader(nuc_file)
    for (nucname, nuc), (aminame, amino) in izip(nucleotides, aminos):
        #aminout.write(nucname + "\t")
        aminout.write(nucname + "\n")
        start = 0  # offset of the next unused codon in nuc
        pieces = []
        for item in amino:
            if item == "-":
                pieces.append("---")
            else:
                # Index into nuc rather than re-slicing it each time.
                pieces.append(nuc[start:start + 3])
                start += 3
        aminout.write("".join(pieces) + "\n")

# test = open(sys.argv[4])
# for line in test:
#     print line
#!/usr/bin/env python """Usage: ./N50.py <fasta>""" import sys import fasta contigs = fasta.FASTAReader(open(sys.argv[1])) # for item in contigs: # print item """sort contigs by length, count contigs, sum lengths of contigs, length of contigs/2, find contig closest to length of contigs/2 >=""" sorted_lengths = [] for (name, sequence) in contigs: seq_length = len(sequence) sorted_lengths.append(seq_length) #reverse=True was contributed by Matthew sorted_lengths = sorted(sorted_lengths, reverse=True) #print sorted_lengths total_length = 0 for length in sorted_lengths: total_length = total_length + length print "total length = %d" % (total_length) count = 0 for item in sorted_lengths:
#!/usr/bin/env python3
"""
Usage: ./week1_hw.py <aligned AA file> <DNA seq blast>

Pairs each aligned protein record with the DNA record at the same position
and prints, per pair, a list holding "---" for every protein gap and the next
DNA codon for every other residue.
"""
import sys
import fasta

prot = open(sys.argv[1])
dna = open(sys.argv[2])
prot_reader = fasta.FASTAReader(prot)
dna_reader = fasta.FASTAReader(dna)

for (prot_id, prot_seq), (dna_id, dna_seq) in zip(prot_reader, dna_reader):
    count = 0  # offset of the next unused codon in dna_seq
    dna_mod = []
    for prot_aa in prot_seq:
        is_gap = prot_aa == "-"
        dna_mod.append("---" if is_gap else dna_seq[count:count + 3])
        if not is_gap:
            # Only real residues consume a codon from the DNA.
            count += 3
    print(dna_mod)
""" ./01_N50 <contig.fa> """ import sys import fasta import itertools import matplotlib.pyplot as plt import numpy as np import math fasta_file = open(sys.argv[1]) nucleotide_seq = [] for ident, sequences in fasta.FASTAReader( fasta_file ): nucleotide_seq.append(sequences) print "Statistics for Velvet" nucleotide_length = [] for i in range(len(nucleotide_seq)): nucleotide_length.append(len(nucleotide_seq[i])) nucleotide_length.sort() print "Max = " + str(max(nucleotide_length)) print "Min = " + str(min(nucleotide_length))
#!/usr/bin/env python
"""
Count kmers in a fasta file

Reads FASTA records from standard input and prints one "<kmer> <count>" line
for every distinct 5-mer found (sequences are upper-cased first).
"""
import sys
import fasta

kmer_counts = {}
k = 5

for ident, sequence in fasta.FASTAReader(sys.stdin):
    sequence = sequence.upper()
    # A sequence of length n has n - k + 1 windows of width k; the "+ 1"
    # keeps the final window, which the old bound skipped.
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        kmer_counts[kmer] = kmer_counts.get(kmer, 0) + 1

# items() plus one formatted string prints the same text on Python 2 and 3.
for kmer, count in kmer_counts.items():
    print("%s %s" % (kmer, count))
contig_lens.sort(reverse=True) l = sum(contig_lens) l2 = float(l) / 2 temp_sum = 0 for length in contig_lens: temp_sum += length if temp_sum > l2: return length file = open(sys.argv[1]) reader = fasta.FASTAReader(file) contig_lens = [] for ident, seq in reader: contig_lens.append(len(seq)) print 'N50 is : ' + str(n50_finder(contig_lens)) print 'Min contig length: ' + str(min(contig_lens)) print 'Max contig length: ' + str(max(contig_lens)) print 'Average contig length: ' + str( float(sum(contig_lens)) / len(contig_lens))
#!/usr/bin/env python """ ./contigs_analyzer.py <contig.fa> <assembler_name> """ import fasta import sys import operator total_l = 0 contigs = [] for name, seq in fasta.FASTAReader(open(sys.argv[1])): if len(seq) == 0: pass sub = [name, seq, len(seq)] total_l += len(seq) contigs.append(sub) contigs = sorted(contigs, key=operator.itemgetter(2), reverse=True) print sys.argv[2] print 'num contigs = %d' % (len(contigs)) print 'max contig length = %d' % (contigs[0][2]) print 'min contig length = %d' % (contigs[-1][2]) print 'avg contig length = %f' % (float(total_l) / float(len(contigs))) ldiv = float(total_l) / 2.0 tot = 0 for each in contigs: tot += each[2]
Gs = sequence.casefold().count('G'.casefold()) Cs = sequence.casefold().count('C'.casefold()) if Gs == 0 and Cs == 0: GCcontent = 0 else: length = len(sequence) GCcontent = (Gs + Cs) / length return GCcontent '''sliding windows''' #1 Mbp windows; slide by 500bp for chromosome in chromosomes: file = '/Users/kateweaver/mm10_genome/chr{}.fa'.format(chromosome) #file = '/home-3/[email protected]/work/users/kweave23/mm10_genome/chr{}.fa'.format(chromosome) reader = fasta.FASTAReader(open(file)) for ident, sequence in reader: window = 0 slides = 0 seqLen = len(sequence) gcList = [] gcMeans = [] starts = [] for i in range(0, seqLen - 1000000, 500): gc = computeGC(sequence[i:i + 1000000]) gcList.append(gc) starts.append(i) window += gc slides += 1 if (i + 1000000) % 10000000 == 0 and i != 0: gcMean = window / slides
selection. PART 4 Plot dN/dS vs. codon position. Color sites under positive selection. """ import sys import fasta from math import sqrt import numpy as np import matplotlib.pyplot as plt plt.style.use('ggplot') from statsmodels.stats import weightstats as stests # Use FASTAReader to read BLAST and MAFFT output files blast = fasta.FASTAReader(open(sys.argv[1])) mafft = fasta.FASTAReader(open(sys.argv[2])) # PART 1 # For every MAFFT AA alignment and its corresponding nucleotide alignment: # Wherever there is a gap in the AA alignment, insert 3 nucleotide gaps (dashes ---) to the nucleotide alignment # Create lists to add gapped DNA and AA alignments to all_nuc_aligns = [] all_aa_aligns = [] # zip iterates through the BLAST and MAFFT files simultaneously # At a given time, you are working with one specific AA alignment and its corresponding nucleotide alignment for (dna_id, dna), (aa_id, aa) in zip(blast, mafft): # Create lists to add gaps/aligned nucleotides and AAs to
#!/usr/bin/env python
"""
finds matching k-mers between a single query sequence and a database of targets

usage: kmer_matcher.py <target.fa> <query.fa> <k>
"""
import sys
import fasta

# Validate with explicit checks: assert statements vanish under "python -O".
if len(sys.argv) != 4:
    sys.exit("usage: kmer_matcher.py <target.fa> <query.fa> <k>")

target_file = open(sys.argv[1])
query_file = open(sys.argv[2])
k = int(sys.argv[3])

target_iterator = fasta.FASTAReader(target_file)

# get query string
line = query_file.readline()
if not line.startswith(">"):
    sys.exit("query file must begin with a FASTA header line ('>')")

# Gather sequence lines up to the first empty line (a blank line or EOF).
sequences = []
while True:
    line = query_file.readline().rstrip("\r\n")
    if line == "":
        break
    sequences.append(line)

query_sequence = "".join(sequences).upper()
usage: ./kmer_matcher.py <target.fa> <query.fa> <k> output: target_sequence_name target_start query_start k_mer """ import sys import fasta target = open(sys.argv[1]) query = open(sys.argv[2]) k = int(sys.argv[3]) # kmer length target_kmers = {} query_kmers = {} # target for ident, sequence in fasta.FASTAReader(target): sequence = sequence.upper() for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in target_kmers: target_kmers[kmer] = [(ident, i)] else: target_kmers[kmer].append((ident, i)) # query for ident, sequence in fasta.FASTAReader(query): sequence = sequence.upper() for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in query_kmers: query_kmers[kmer] = [i]
import numpy as np sfile = open(sys.argv[1]) tfile = open(sys.argv[2]) # HoxD70 matrix of Chiaromonte, Yap, Miller 2002, # A C G T sigma = [ [ 91, -114, -31, -123 ], [ -114, 100, -125, -31 ], [ -31, -125, 100, -114 ], [ -123, -31, -114, 91 ] ] gap = 300 hoxd70 = {'A': 0, 'C': 1, 'G': 2, 'T': 3} for ident, sequence in fasta.FASTAReader(sfile): sequence = sequence.upper() s = sequence for ident, sequence in fasta.FASTAReader(tfile): sequence = sequence.upper() t = sequence slen = len(s)+1 tlen = len(t)+1 # create empty matrices score = np.zeros((slen,tlen)) traceback = np.chararray((slen,tlen)) # initialize matrices
#!/usr/bin/env python3 import sys import fasta target_sequence = open(sys.argv[1]) query_sequence = open(sys.argv[2]) reader = fasta.FASTAReader(query_sequence) kmers = {} k = int(sys.argv[3]) for ident, sequence in reader: for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in kmers: kmers[kmer] = [i] else: kmers[kmer].append(i) # else: # kmers[kmer] += 1 reader = fasta.FASTAReader(target_sequence) for ident, sequence in reader: for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer in kmers: for key in range(len(kmers[kmer])):
nucleotide = open(sys.argv[1]) dN = [] dS = [] #4871 represents the length of the protein sequence # this is building lists that can be indexed later for i in range(0, 4871): dN.append(0) dS.append(0) # imports query and target sequences nucleotide_seq = [] for ident, sequences in fasta.FASTAReader(nucleotide): nucleotide_seq.append(sequences) # list containing query sequence query_seq = nucleotide_seq[:1] # list of target sequences target_seq = nucleotide_seq[1:] #goes through and gets rid of for n in range(len(target_seq)): count = 0 prot_count = 0 while count < 14614: target = target_seq[n][count:count + 3] query = query_seq[0][count:count + 3]
""" Get stats about contigs like min,max,avg, N50. usage: ./contig_stats.py <contigs.fa> """ import sys import fasta #import pandas as pd import numpy as np f=open(sys.argv[1]) lencontig=[] for ident,sequence in fasta.FASTAReader(f): length=len(sequence) lencontig.append(length) lencontig.sort() print "total contigs is", len(lencontig) print "mean is", np.mean(lencontig) print "max is", max(lencontig) print "min is", min(lencontig) print "median is", np.median(lencontig) totl=sum(lencontig) l=0 i=0 while l<totl/2: l=l+lencontig[i]
#!/usr/bin/env python3 # match kmers import sys import fasta target = open(sys.argv[1]) # subset.fa query = open(sys.argv[2]) # droYak2_seq.fa k = int(sys.argv[3]) # use 11 reader = fasta.FASTAReader(target) # use target file (subset.fa) target_dict = {} for ident, sequence in reader: for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in target_dict: target_dict[kmer] = [ (ident, i) ] # kmer as key, gene name and start pos as value else: target_dict[kmer].append((ident, i)) # add tuple in list # for key in target_dict: # print(key, target_dict[key]) reader2 = fasta.FASTAReader(query) # use query file (droYak2_seq) for ident, sequence in reader2: for j in range(0, len(sequence) - k):
""" """ import sys import fasta # import 02-kmer-count target = open(sys.argv[1]) query = open(sys.argv[2]) k = int(sys.argv[3]) target_dict = {} # Put target file into readable format for ident, sequence in fasta.FASTAReader(target): sequence = sequence.upper() for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in target_dict: target_dict = [] target_dict[kmer].append((indent, i)) else: target_dict[kmer].append((indent, i)) print "Target Sequence Name: %s Target Position: %s Query Psition: %s Kmer: %s" #print sequence ident, sequence_q = fasta.FASTAReader(query).next() for i in range(0, len(sequence) - k): qkmer = sequence_q[i:i + k] if q_kmer in target_dict:
#!/usr/bin/env python
"""
usage <contigs.fa>

Reads every contig from the FASTA file and prints the minimum, maximum and
mean contig length.
"""
import sys
import fasta
import numpy as np

contig = open(sys.argv[1])

# Sequences in file order, then their lengths in ascending order.
contig_seq = [sequence for ident, sequence in fasta.FASTAReader(contig)]
contig_len = sorted(len(one_contig) for one_contig in contig_seq)

mean_contig_len = np.mean(contig_len)

print("Min = " + str(min(contig_len)))
print("Max = " + str(max(contig_len)))
print("Mean = " + str(mean_contig_len))

i = 0
k = 0
#!/usr/bin/env python3
"""
Prints the target sequence name and start position, query start position,
and the kmer matched

usage: <this script> <query.fa> <k> < <target.fa>

The target records are read from standard input. For every target k-mer that
also occurs in the query, one line is printed: target name, target start,
the list of query start positions, and the k-mer.
"""
import sys
import fasta

k = int(sys.argv[2])

# Index the query file (droYak2_seq.fa): k-mer -> list of start positions.
# Only k-mer strings are used as keys; the old position-keyed entries were
# never looked up and only bloated the index.
query_kmers = {}
with open(sys.argv[1]) as query_file:
    for ident, sequence in fasta.FASTAReader(query_file):
        # n - k + 1 windows of width k; the "+ 1" keeps the final window.
        for posn in range(len(sequence) - k + 1):
            kmer = sequence[posn:posn + k]
            query_kmers.setdefault(kmer, []).append(posn)

# Scan the target file (subset.fa) arriving on standard input.
for ident, sequence in fasta.FASTAReader(sys.stdin):
    for i in range(len(sequence) - k + 1):
        target_kmer = sequence[i:i + k]
        if target_kmer in query_kmers:
            print(ident, i, query_kmers[target_kmer], target_kmer)
./realign.py <prot.fa> <nuc.fa> <output_figure> """ import sys import fasta import numpy as np from statsmodels.stats.weightstats import ztest import matplotlib.pyplot as plt aa = open(sys.argv[1]) nuc = open(sys.argv[2]) aa_list = [] nuc_list = [] for ident, seq in fasta.FASTAReader(aa): # Need * for stop codon aa_list.append(seq) for ident, seq in fasta.FASTAReader(nuc): # Split nuc_list into codons codons = [] stop_cods = ['TAG', 'TAA', 'TGA'] for i in range(0, len(seq), 3): if seq[i:i + 3] in stop_cods: pass else: codons.append(seq[i:i + 3]) nuc_list.append(codons)
'TGA': '_', 'TGG': 'W', } new_sequence = open(sys.argv[1]) dn = [] ds = [] #4871 from dividing codons by 3 for i in range(0, 4871): dn.append(0) ds.append(0) nuc_seq = [] for ident, sequences in fasta.FASTAReader(new_sequence): nuc_seq.append(sequences) query_seq = nuc_seq[:1] target_seq = nuc_seq[1:] #print query_seq # samtools faidx new_seq.fa WNFCG_1 for n in range(len(target_seq)): count = 0 prot_count = 0 #14614 is the length of every nucelotide while count < 14614: target = target_seq[n][count:count + 3]
#!/usr/bin/env python3
"""Count k-mers (k = 11) in the FASTA records read from standard input.

Prints one "<kmer> <count>" line for every distinct k-mer found.
"""
import sys
import fasta

reader = fasta.FASTAReader(sys.stdin)
kmers = {}
k = 11

for ident, sequence in reader:
    # A sequence of length n has n - k + 1 windows of width k; the "+ 1"
    # keeps the final window, which the old bound skipped.
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1

for key, count in kmers.items():
    print(key, count)
this file, subset file, the yak file, and kmer amount input """ import sys import fasta opensub = open(sys.argv[1]) openyak = open(sys.argv[2]) k = int(sys.argv[3]) index = {} #adding stuff to dictionary for ident, sequence in fasta.FASTAReader(opensub): sequence = sequence.upper() for i in range(0, len(sequence) - k): kmer = sequence[i:i + k] if kmer not in index: index[kmer] = [(ident, i)] else: index[kmer].append((ident, i)) #now finding matches and printing count = 0 ident, sequence = fasta.FASTAReader(openyak).next() #for ident, sequence in fasta.FASTAReader(openyak).next(): sequence = sequence.upper()
#!/usr/bin/env python3 # loop through the amino seq but theoutput is nt for the aminos that have letter the ones that dont have letter are sub to - # count is based on pep but out is into the nuc # looping through the pep add counter add in range equation # add to the counter at the end after equation import fasta import sys import numpy as np import matplotlib.pyplot as plt nt = fasta.FASTAReader(open(sys.argv[1])) amino = fasta.FASTAReader(open(sys.argv[2])) # print('start') nt_all = [] for(nt_id, nt_seq), (amino_id, amino_seq) in zip(nt,amino): nuc = [] inj = 0 for pep in amino_seq: if pep is "-": nuc.append("---") else: nseq = nt_seq[inj:inj+3] nuc.append(nseq) inj = inj+3 nt_all.append(nuc) # print(nseq) print(len(nt_all[0])) # print(len(nt_all)) codon = { 'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
#!/usr/bin/env python3 #this script requires 3 inputs: kmer_matcher.py (target fasta) (query fasta) (kmer size) import sys import fasta target = open(sys.argv[1]) reader = fasta.FASTAReader(target) k = sys.argv[3] target_dict = {} #So, brian, tall dude; this is the same code from countkmers.py: but yeilds a dictionary target_dic{kmer,[(gene,pos),(gene 2, pos3)]} #ident, sequence is arbitraryish, right? for ident, sequence in reader: for i in range(0, len(sequence) - int(k)): kmer = sequence[i:i + int(k)] if kmer not in target_dict: target_dict[kmer] = [(ident, i)] else: target_dict[kmer].append((ident, i)) #alright Moron. Yes Brian, thats you. now we generate kmers from the query, and search for those kmers in target_dict. if its there, we are printing out its name and its position in the target file. query = open(sys.argv[2]) reader2 = fasta.FASTAReader(query) for ident2, sequence2 in reader2:
#!/usr/bin/env python3 import sys import fasta import numpy as np from scipy import stats import matplotlib.pyplot as plt import math aa_reader = fasta.FASTAReader(open(sys.argv[1])) dna_reader = fasta.FASTAReader(open(sys.argv[2])) dic = {} Z_test = [] diff = [] sig = [] sig_pos = [] non_sig = [] non_sig_pos = [] rel = 0 for (dna_ident, dna), (aa_ident, aa) in zip(dna_reader, aa_reader): j = 0 gaps = [] AA = [] for i in range(len(aa)): AA.append(aa[i])
import sys, fasta target = open(sys.argv[1]) source = open(sys.argv[2]) lengths = [] k = int(sys.argv[3]) #make a query dictionary kmer_source = {} # put query in FASTA reader. spits out gene name and sequence # when your cursor is within 0 to length (defined by k), ### kmer is the sequence from cursor to cursor + 11 which defines the kmer) ### if the kmer string is not in the dictionary, add to dictionary for ident, sequence in fasta.FASTAReader(source): sequence = sequence.upper() for i in range(0, len(sequence)-k): kmer = sequence[i : i + k] if kmer not in kmer_source: kmer_source[kmer] = [] kmer_source[kmer].append(i) #make target dictionaries ## wh kmer_position = {} for ident, sequence in fasta.FASTAReader(target): sequence = sequence.upper()