Ejemplo n.º 1
0
def kmer_map(input_file, k, get_seq=True):
    """
    Build an index of every k-mer found in a FASTA file.

    input_file: FASTA file handle, passed straight to fasta.FASTAReader
    k: kmer length
    get_seq: if True each hit is stored as (sequence name, start position);
             if False only the start position is stored
    returns: dictionary mapping each upper-cased kmer to the list of its
             hits, in the order they were encountered
    """
    kmer_dict = {}
    for ident, sequence in fasta.FASTAReader(input_file):
        sequence = sequence.upper()  # index k-mers case-insensitively
        # The last full window starts at len(sequence) - k, so the range has
        # to stop at len(sequence) - k + 1. Sequences shorter than k produce
        # an empty range and contribute nothing.
        for start in range(len(sequence) - k + 1):
            kmer = sequence[start:start + k]
            hit = (ident, start) if get_seq else start
            kmer_dict.setdefault(kmer, []).append(hit)
    return kmer_dict
Ejemplo n.º 2
0
#!/usr/bin/env python

import sys
import fasta
import itertools
from itertools import izip

d_file = open(sys.argv[1])
a_file = open(sys.argv[2])
align_file = open("alignment_nuc1.fa", "w")

for (d_ident, d_seq), (a_ident,
                       a_seq) in itertools.izip(fasta.FASTAReader(d_file),
                                                fasta.FASTAReader(a_file)):
    position = 0
    for a in a_seq:
        if a == "-":
            align_file.write("---")
        else:
            align_file.write(d_seq[position:position + 3])
            position = position + 1

    align_file.write("\n")
    print align_file
Ejemplo n.º 3
0
def outliers_z_score(ys):
    """Return the positions in ``ys`` whose scaled magnitude exceeds 0.64.

    Each value is divided by ``np.std(ys)``; the centre is fixed at zero
    instead of the sample mean. The return value is whatever ``np.where``
    gives for the boolean mask, i.e. a tuple holding one index array.
    """
    cutoff = 0.64
    centre = 0
    spread = np.std(ys)

    scaled = []
    for value in ys:
        scaled.append((value - centre) / spread)

    exceeds = np.abs(scaled) > cutoff
    return np.where(exceeds)


# Inputs: two FASTA files that are read in lockstep (argv[1], argv[2]) and an
# output filename (argv[3]) that is only stored here, not opened.
nu_file = open(sys.argv[1])
aa_file = open(sys.argv[2])
out_filename = sys.argv[3]

nu_reader = fasta.FASTAReader(nu_file)
aa_reader = fasta.FASTAReader(aa_file)

# Intended layout of `mut`, one entry per position:
# mut = [[codon, aa, dn, ds], [codon, aa, dn, ds], ...]

index = 0
mut = []

# Pair up records from the two readers; izip stops with the shorter reader.
for (nident, nseq), (aident, aseq) in it.izip(nu_reader, aa_reader):
    nid = 0  # offset into nseq for the three-character slice taken below
    # print aseq
    for aid in range(len(aseq)):
        aa = aseq[aid]
        codon = nseq[nid:nid + 3]

        # `index` is 0 until something outside the visible source changes it.
        if index == 0:
Ejemplo n.º 4
0
#!/usr/bin/env python
"""Usage: $ ./alignment.py <alignment_prot.fa> <1000_homologues.fa> <aminout.out>"""
"""Most of this code was contributed by Tabea.  I figured out my own order of opening files and tab separating the amino acid ids from their nucleotide sequences and then returning the lines after.  I tested my tab separation within the lines using the commented out part at the bottom and command line entry: $ ./alignment.py <alignment_prot.fa> <1000_homologues.fa> <aminout.out> <aminout.out> | less -S

Then I realized that I did want the ids on different lines than the sequences for a fasta format and changed it back."""

import sys
import itertools
import fasta

# For each pair of records, write the nucleotide record's name on one line
# and, on the next, the nucleotide sequence threaded onto the amino-acid
# sequence: "-" becomes "---", any other character takes the next three
# nucleotides.
aminos = fasta.FASTAReader(open(sys.argv[1]))
nucleotides = fasta.FASTAReader(open(sys.argv[2]))
aminout = open(sys.argv[3], 'w')

for (nucname, nuc), (aminame, amino) in itertools.izip(nucleotides, aminos):
    aminout.write(nucname + "\n")
    pieces = []
    offset = 0  # index of the next unused nucleotide
    for item in amino:
        if item == "-":
            pieces.append("---")
        else:
            # Slice by offset instead of repeatedly re-slicing `nuc`, which
            # copied the remaining sequence on every residue.
            pieces.append(nuc[offset:offset + 3])
            offset += 3
    aminout.write("".join(pieces) + "\n")

# Close explicitly so the output is flushed to disk.
aminout.close()

# test = open(sys.argv[4])
# for line in test:
#     print line
Ejemplo n.º 5
0
#!/usr/bin/env python
"""Usage: ./N50.py <fasta>"""

import sys
import fasta

# Reader over the FASTA file named on the command line.
contigs = fasta.FASTAReader(open(sys.argv[1]))

# for item in contigs:
#     print item
"""sort contigs by length, count contigs, sum lengths of contigs, length of contigs/2, find contig closest to length of contigs/2 >="""

sorted_lengths = []

# Record the length of every contig; the names are not used.
for (name, sequence) in contigs:
    seq_length = len(sequence)
    sorted_lengths.append(seq_length)

#reverse=True was contributed by Matthew
# Longest contig first.
sorted_lengths = sorted(sorted_lengths, reverse=True)

#print sorted_lengths

# Sum of all contig lengths.
total_length = 0
for length in sorted_lengths:
    total_length = total_length + length

print "total length = %d" % (total_length)

# Second pass over the lengths, longest first (the loop body is outside the
# visible source).
count = 0
for item in sorted_lengths:
Ejemplo n.º 6
0
#!/usr/bin/env python3
""" Usage: ./week1_hw.py <aligned AA file> <DNA seq blast>"""
# compare aligned amino acid file to DNA sequence to replace DNA sequence with gaps where there are gaps in protein alignment

import sys
import fasta

# argv[1] is the aligned amino-acid FASTA, argv[2] the DNA FASTA (see the
# usage string at the top of the script).
prot = open(sys.argv[1])
dna = open(sys.argv[2])

prot_reader = fasta.FASTAReader(prot)
dna_reader = fasta.FASTAReader(dna)

for (prot_id, prot_seq), (dna_id, dna_seq) in zip(prot_reader, dna_reader):
    # One list entry per alignment column: "---" for a gap, otherwise the
    # next three characters of the DNA sequence.
    dna_mod = []
    count = 0
    for prot_aa in prot_seq:
        if prot_aa != "-":
            dna_mod.append(dna_seq[count:count + 3])
            count += 3
            continue
        dna_mod.append("---")

    print(dna_mod)
Ejemplo n.º 7
0
"""
./01_N50 <contig.fa>
"""

import sys
import fasta
import itertools
import matplotlib.pyplot as plt
import numpy as np
import math

fasta_file = open(sys.argv[1])


nucleotide_seq = []
for ident, sequences in fasta.FASTAReader( fasta_file ):
    nucleotide_seq.append(sequences)
    
print "Statistics for Velvet"

nucleotide_length = []
for i in range(len(nucleotide_seq)):
    nucleotide_length.append(len(nucleotide_seq[i]))

nucleotide_length.sort()


print "Max = " + str(max(nucleotide_length))
print "Min = " + str(min(nucleotide_length))

Ejemplo n.º 8
0
#!/usr/bin/env python
"""
Count kmers in a fasta file
"""

import sys
import fasta

kmer_counts = {}
k = 5
for ident, sequence in fasta.FASTAReader(sys.stdin):
    sequence = sequence.upper()
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in kmer_counts:
            kmer_counts[kmer] = 1
        else:
            kmer_counts[kmer] += 1

for kmer, count in kmer_counts.iteritems():
    print kmer, count
Ejemplo n.º 9
0
    contig_lens.sort(reverse=True)

    l = sum(contig_lens)
    l2 = float(l) / 2

    temp_sum = 0

    for length in contig_lens:

        temp_sum += length
        if temp_sum > l2:
            return length


file = open(sys.argv[1])

reader = fasta.FASTAReader(file)

contig_lens = []

for ident, seq in reader:

    contig_lens.append(len(seq))

print 'N50 is : ' + str(n50_finder(contig_lens))
print 'Min contig length: ' + str(min(contig_lens))
print 'Max contig length: ' + str(max(contig_lens))
print 'Average contig length: ' + str(
    float(sum(contig_lens)) / len(contig_lens))
#!/usr/bin/env python
"""
./contigs_analyzer.py <contig.fa> <assembler_name>
"""

import fasta
import sys
import operator

total_l = 0
contigs = []
for name, seq in fasta.FASTAReader(open(sys.argv[1])):
    if len(seq) == 0:
        pass
    sub = [name, seq, len(seq)]
    total_l += len(seq)
    contigs.append(sub)

contigs = sorted(contigs, key=operator.itemgetter(2), reverse=True)

print sys.argv[2]
print 'num contigs = %d' % (len(contigs))
print 'max contig length = %d' % (contigs[0][2])
print 'min contig length = %d' % (contigs[-1][2])
print 'avg contig length = %f' % (float(total_l) / float(len(contigs)))

# Half of the summed contig length.
ldiv = float(total_l) / 2.0

# Running sum of contig lengths in the order of `contigs` (sorted longest
# first earlier in the script); the rest of the loop is outside the visible
# source.
tot = 0
for each in contigs:
    tot += each[2]
Ejemplo n.º 11
0
    Gs = sequence.casefold().count('G'.casefold())
    Cs = sequence.casefold().count('C'.casefold())
    if Gs == 0 and Cs == 0:
        GCcontent = 0
    else:
        length = len(sequence)
        GCcontent = (Gs + Cs) / length
    return GCcontent


'''sliding windows'''  #1 Mbp windows; slide by 500bp

# For every chromosome FASTA file, slide a 1,000,000-character window along
# each sequence in steps of 500 and record the window's GC content.
for chromosome in chromosomes:
    file = '/Users/kateweaver/mm10_genome/chr{}.fa'.format(chromosome)
    #file = '/home-3/[email protected]/work/users/kweave23/mm10_genome/chr{}.fa'.format(chromosome)
    reader = fasta.FASTAReader(open(file))
    for ident, sequence in reader:
        window = 0  # running sum of GC values since the last reset (if any)
        slides = 0  # number of windows added to `window`
        seqLen = len(sequence)
        gcList = []  # GC value of every window
        gcMeans = []
        starts = []  # start offset of every window
        # The range stops before seqLen - 1000000, so every slice below is a
        # full-length window.
        for i in range(0, seqLen - 1000000, 500):
            gc = computeGC(sequence[i:i + 1000000])
            gcList.append(gc)
            starts.append(i)
            window += gc
            slides += 1
            # True whenever the window's end offset is a multiple of
            # 10,000,000 (never for the very first window).
            if (i + 1000000) % 10000000 == 0 and i != 0:
                gcMean = window / slides
Ejemplo n.º 12
0
selection.

PART 4
Plot dN/dS vs. codon position. Color sites under positive selection.
"""

import sys
import fasta
from math import sqrt
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from statsmodels.stats import weightstats as stests

# Use FASTAReader to read BLAST and MAFFT output files
# (argv[1] is read as `blast`, argv[2] as `mafft`)
blast = fasta.FASTAReader(open(sys.argv[1]))
mafft = fasta.FASTAReader(open(sys.argv[2]))

# PART 1
# For every MAFFT AA alignment and its corresponding nucleotide alignment:
# Wherever there is a gap in the AA alignment, insert 3 nucleotide gaps (dashes ---) to the nucleotide alignment

# Create lists to add gapped DNA and AA alignments to
all_nuc_aligns = []
all_aa_aligns = []

# zip iterates through the BLAST and MAFFT files simultaneously
# At a given time, you are working with one specific AA alignment and its corresponding nucleotide alignment
# NOTE(review): records are paired purely by position and zip stops at the
# shorter file - confirm both files list the sequences in the same order.
for (dna_id, dna), (aa_id, aa) in zip(blast, mafft):

    # Create lists to add gaps/aligned nucleotides and AAs to
Ejemplo n.º 13
0
#!/usr/bin/env python
"""
finds matching k-mers between a single query sequence and a database of targets
usage: kmer_matcher.py <target.fa> <query.fa> <k>
"""

import sys
import fasta

# Require exactly three command-line arguments besides the script name.
assert len(sys.argv) == 4

target_file = open(sys.argv[1])
query_file = open(sys.argv[2])
k = int(sys.argv[3])

target_iterator = fasta.FASTAReader(target_file)

# get query string
# The query file is parsed by hand instead of with FASTAReader: its first
# line must be a ">" header and the following lines are joined together.

line = query_file.readline()
assert line.startswith(">")
sequences = []

# readline() returns "" at end of file and a blank line also strips down to
# "", so reading stops at whichever comes first. A later ">" header line is
# not treated specially and would be appended like any other line.
while True:
    line = query_file.readline().rstrip("\r\n")
    if line == "":
        break
    else:
        sequences.append(line)

# Upper-cased query with the line breaks removed.
query_sequence = "".join(sequences).upper()
Ejemplo n.º 14
0
usage: ./kmer_matcher.py <target.fa> <query.fa> <k>
output: target_sequence_name target_start query_start k_mer
"""

import sys
import fasta

target = open(sys.argv[1])
query = open(sys.argv[2])
k = int(sys.argv[3])  # kmer length

target_kmers = {}  # kmer -> list of (sequence name, start position)
query_kmers = {}  # kmer -> list of start positions

# target
# NOTE(review): range(0, len(sequence) - k) stops one short - the k-mer that
# starts at len(sequence) - k is never visited. The same applies to the query
# loop below.
for ident, sequence in fasta.FASTAReader(target):
    sequence = sequence.upper()
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in target_kmers:
            target_kmers[kmer] = [(ident, i)]
        else:
            target_kmers[kmer].append((ident, i))

# query
# Only the position is stored here; k-mers from every query record share the
# same dictionary.
for ident, sequence in fasta.FASTAReader(query):
    sequence = sequence.upper()
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in query_kmers:
            query_kmers[kmer] = [i]
Ejemplo n.º 15
0
import numpy as np

sfile = open(sys.argv[1])
tfile = open(sys.argv[2])

# HoxD70 matrix of Chiaromonte, Yap, Miller 2002,
#              A     C     G     T
sigma = [ [   91, -114,  -31, -123 ],
          [ -114,  100, -125,  -31 ],
          [  -31, -125,  100, -114 ],
          [ -123,  -31, -114,   91 ] ]

gap = 300
# Row/column index of each base in `sigma`.
hoxd70 = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# `s` ends up as the upper-cased sequence of the LAST record in the file;
# it stays undefined if the file has no records.
for ident, sequence in fasta.FASTAReader(sfile):
    sequence = sequence.upper()
    s = sequence

# Same for `t` and the second file.
for ident, sequence in fasta.FASTAReader(tfile):
    sequence = sequence.upper()
    t = sequence

# One extra row/column beyond the sequence lengths.
slen = len(s)+1
tlen = len(t)+1

# create empty matrices
score = np.zeros((slen,tlen))
traceback = np.chararray((slen,tlen))

# initialize matrices
Ejemplo n.º 16
0
#!/usr/bin/env python3

import sys
import fasta

target_sequence = open(sys.argv[1])
query_sequence = open(sys.argv[2])

reader = fasta.FASTAReader(query_sequence)

kmers = {}  # query kmer -> list of start positions

k = int(sys.argv[3])

# Index the query. Sequences are used as read (no case normalisation).
# NOTE(review): range(0, len(sequence) - k) never visits the k-mer that
# starts at len(sequence) - k; the target loop below has the same bound.
for ident, sequence in reader:
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in kmers:
            kmers[kmer] = [i]
        else:
            kmers[kmer].append(i)
#        else:
#            kmers[kmer] += 1

# `reader` is rebound to the target file from here on.
reader = fasta.FASTAReader(target_sequence)

# Look every target k-mer up in the query index and walk its query positions
# (the body of the innermost loop is outside the visible source).
for ident, sequence in reader:
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer in kmers:
            for key in range(len(kmers[kmer])):
Ejemplo n.º 17
0
nucleotide = open(sys.argv[1])

dN = []

dS = []

#4871 represents the length of the protein sequence
# this is building lists that can be indexed later
# (both lists start as 4871 zeros)
for i in range(0, 4871):
    dN.append(0)
    dS.append(0)

# imports query and target sequences
nucleotide_seq = []
for ident, sequences in fasta.FASTAReader(nucleotide):
    nucleotide_seq.append(sequences)

# list containing query sequence
# (a one-element list holding the first record, or empty if the file is empty)
query_seq = nucleotide_seq[:1]

# list of target sequences
# (every record after the first)
target_seq = nucleotide_seq[1:]

#goes through and gets rid of
# Compare each target with the query three characters at a time.
# NOTE(review): 4871 * 3 is 14613, not 14614 - confirm the loop bound against
# how `count` is advanced in the part of the loop not shown here.
for n in range(len(target_seq)):
    count = 0
    prot_count = 0
    while count < 14614:
        target = target_seq[n][count:count + 3]
        query = query_seq[0][count:count + 3]
"""
Get stats about contigs like min,max,avg, N50.

usage: ./contig_stats.py <contigs.fa>
"""


import sys
import fasta
#import pandas as pd
import numpy as np

f=open(sys.argv[1])

# Length of every contig in the file.
lencontig=[]
for ident,sequence in fasta.FASTAReader(f):
    length=len(sequence)
    lencontig.append(length)

# Ascending order: shortest contig first.
lencontig.sort()
print "total contigs is", len(lencontig)
print "mean is", np.mean(lencontig)
print "max is", max(lencontig)
print "min is", min(lencontig)
print "median is", np.median(lencontig)

# Accumulate lengths until half of the total is reached.
# NOTE(review): the list is sorted ascending, so accumulation starts from the
# shortest contig; N50 is normally accumulated from the longest - confirm
# against the rest of the loop, which is outside the visible source.
totl=sum(lencontig)
l=0
i=0
while l<totl/2:
    l=l+lencontig[i]
Ejemplo n.º 19
0
#!/usr/bin/env python3

# match kmers

import sys
import fasta

target = open(sys.argv[1])  # subset.fa
query = open(sys.argv[2])  # droYak2_seq.fa
k = int(sys.argv[3])  # use 11

reader = fasta.FASTAReader(target)  # use target file (subset.fa)

target_dict = {}

# Index the target: every k-mer maps to all (name, start) places it occurs.
# Sequences are used as read (no case normalisation).
# NOTE(review): range(0, len(sequence) - k) never visits the k-mer starting
# at len(sequence) - k; the query loop below uses the same bound.
for ident, sequence in reader:
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in target_dict:
            target_dict[kmer] = [
                (ident, i)
            ]  # kmer as key, gene name and start pos as value
        else:
            target_dict[kmer].append((ident, i))  # add tuple in list

# for key in target_dict:
#    print(key, target_dict[key])

# Walk the query k-mer start positions (loop body is outside the visible
# source).
reader2 = fasta.FASTAReader(query)  # use query file (droYak2_seq)
for ident, sequence in reader2:
    for j in range(0, len(sequence) - k):
"""

"""

import sys
import fasta
# import 02-kmer-count

target = open(sys.argv[1])
query = open(sys.argv[2])
k = int(sys.argv[3])

target_dict = {}

# Index the target file: every upper-cased k-mer maps to the list of
# (sequence name, start position) pairs where it occurs.
for ident, sequence in fasta.FASTAReader(target):
    sequence = sequence.upper()
    # + 1 so the final window, which starts at len(sequence) - k, is indexed.
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        # setdefault creates the per-k-mer list on first sight. The previous
        # code rebound target_dict itself to a list and appended tuples built
        # from an undefined name (`indent`), so it could never run.
        target_dict.setdefault(kmer, []).append((ident, i))

# Prints the format string itself; no values are substituted for the %s
# placeholders on this line.
print "Target Sequence Name: %s Target Position: %s Query Psition: %s Kmer: %s"
#print sequence
# Take only the first record of the query file.
ident, sequence_q = fasta.FASTAReader(query).next()
# NOTE(review): the bound uses `sequence` (the last target record read above)
# rather than `sequence_q`, and the test below reads `q_kmer` while the line
# before it assigns `qkmer` - confirm both names.
for i in range(0, len(sequence) - k):
    qkmer = sequence_q[i:i + k]
    if q_kmer in target_dict:
Ejemplo n.º 21
0
#!/usr/bin/env python
"""

usage <contigs.fa>

"""

import sys
import fasta
import numpy as np

contig = open(sys.argv[1])
contig_seq = []

for ident, sequence in fasta.FASTAReader(contig):
    contig_seq.append(sequence)

contig_len = []

for i in range(len(contig_seq)):
    contig_len.append(len(contig_seq[i]))

contig_len.sort()
mean_contig_len = np.mean(contig_len)

print "Min = " + str(min(contig_len))
print "Max = " + str(max(contig_len))
print "Mean = " + str(mean_contig_len)

i = 0
k = 0
Ejemplo n.º 22
0
#!/usr/bin/env python3
"""
Prints the target sequence name and start position, query start position, and the kmer matched
"""

import sys
import fasta

# Target records arrive on standard input (subset.fa per the original notes).
reader1 = fasta.FASTAReader(sys.stdin)
# Query records come from the file named in argv[1] (droYak2_seq.fa per the
# original notes).
reader2 = fasta.FASTAReader(open(sys.argv[1]))
k = int(sys.argv[2])

# query kmer -> list of start positions in the query
query_kmers = {}

for ident, sequence in reader2:
    # + 1 so the final window, which starts at len(sequence) - k, is indexed.
    for posn in range(len(sequence) - k + 1):
        kmer = sequence[posn:posn + k]
        # Only k-mer keys belong in the index; the former extra
        # `query_kmers[posn] = kmer` entry stored one unused integer key per
        # position.
        query_kmers.setdefault(kmer, []).append(posn)

# Report every target k-mer that also occurs in the query: target name,
# target start, list of query starts, and the k-mer itself.
for ident, sequence in reader1:
    for i in range(len(sequence) - k + 1):
        target_kmer = sequence[i:i + k]
        if target_kmer in query_kmers:
            print(ident, i, query_kmers[target_kmer], target_kmer)
Ejemplo n.º 23
0
./realign.py <prot.fa> <nuc.fa> <output_figure>
"""

import sys
import fasta
import numpy as np
from statsmodels.stats.weightstats import ztest
import matplotlib.pyplot as plt

aa = open(sys.argv[1])
nuc = open(sys.argv[2])

aa_list = []
nuc_list = []

# Sequences from the first file, one entry per record.
# Need * for stop codon
for ident, seq in fasta.FASTAReader(aa):
    aa_list.append(seq)

# Sequences from the second file, each cut into three-character pieces with
# the three stop codons left out.
for ident, seq in fasta.FASTAReader(nuc):
    stop_cods = ['TAG', 'TAA', 'TGA']
    codons = []
    for i in range(0, len(seq), 3):
        triplet = seq[i:i + 3]
        if triplet not in stop_cods:
            codons.append(triplet)
    nuc_list.append(codons)

Ejemplo n.º 24
0
    'TGA': '_',
    'TGG': 'W',
}

new_sequence = open(sys.argv[1])

dn = []
ds = []

#4871 from dividing codons by 3
# (both lists start as 4871 zeros)
for i in range(0, 4871):
    dn.append(0)
    ds.append(0)

# Sequence of every record in the file, in file order.
nuc_seq = []
for ident, sequences in fasta.FASTAReader(new_sequence):
    nuc_seq.append(sequences)

# First record (as a one-element list) versus all remaining records.
query_seq = nuc_seq[:1]
target_seq = nuc_seq[1:]

#print query_seq

# samtools faidx new_seq.fa WNFCG_1

# Step through each target three characters at a time.
# NOTE(review): 4871 * 3 is 14613, not 14614 - confirm the loop bound against
# how `count` is advanced in the part of the loop not shown here.
for n in range(len(target_seq)):
    count = 0
    prot_count = 0
    #14614 is the length of every nucelotide
    while count < 14614:
        target = target_seq[n][count:count + 3]
Ejemplo n.º 25
0
#!/usr/bin/env python3

import sys
import fasta

# Count how often every 11-mer occurs across all sequences read from standard
# input, then print one "kmer count" pair per line.
reader = fasta.FASTAReader(sys.stdin)

kmers = {}
k = 11

for ident, sequence in reader:
    # + 1 so the final window, which starts at len(sequence) - k, is counted.
    for i in range(len(sequence) - k + 1):
        kmer = sequence[i:i + k]
        kmers[kmer] = kmers.get(kmer, 0) + 1

for key, count in kmers.items():
    print(key, count)
Ejemplo n.º 26
0
this file, subset file, the yak file, and kmer amount input
"""

import sys
import fasta

opensub = open(sys.argv[1])
openyak = open(sys.argv[2])

k = int(sys.argv[3])

# kmer -> list of (sequence name, start position) from the first file
index = {}

#adding stuff to dictionary
# NOTE(review): range(0, len(sequence) - k) never visits the k-mer that
# starts at len(sequence) - k.

for ident, sequence in fasta.FASTAReader(opensub):
    sequence = sequence.upper()
    for i in range(0, len(sequence) - k):
        kmer = sequence[i:i + k]
        if kmer not in index:
            index[kmer] = [(ident, i)]
        else:
            index[kmer].append((ident, i))

#now finding matches and printing

count = 0

# Only the first record of the second file is read (.next() on the reader).
ident, sequence = fasta.FASTAReader(openyak).next()
#for ident, sequence in fasta.FASTAReader(openyak).next():
sequence = sequence.upper()
Ejemplo n.º 27
0
#!/usr/bin/env python3
# loop through the amino seq but theoutput is nt for the aminos that have letter the ones that dont have letter are sub to -
# count is based on pep but out is into the nuc 
# looping through the pep add counter add in range equation 
# add to the counter at the end after equation 
import fasta
import sys
import numpy as np 
import matplotlib.pyplot as plt

# argv[1] is read as the nucleotide FASTA, argv[2] as the amino-acid FASTA.
nt = fasta.FASTAReader(open(sys.argv[1]))
amino = fasta.FASTAReader(open(sys.argv[2]))

# One list per record pair; each list holds one entry per amino-acid
# character: "---" for a gap, otherwise the next three nucleotides.
nt_all = []
for (nt_id, nt_seq), (amino_id, amino_seq) in zip(nt, amino):
    nuc = []
    inj = 0  # index of the next unused nucleotide
    for pep in amino_seq:
        # Compare by value; `is` tests object identity and is not a reliable
        # way to compare with a string literal.
        if pep == "-":
            nuc.append("---")
        else:
            nseq = nt_seq[inj:inj + 3]
            nuc.append(nseq)
            # Only a real residue consumes nucleotides; advancing on gaps as
            # well skipped three nucleotides for every gap column.
            inj = inj + 3
    # Append once per record; appending inside the residue loop stored the
    # same list once per residue.
    nt_all.append(nuc)

print(len(nt_all[0]))
        # print(len(nt_all))
codon = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
Ejemplo n.º 28
0
#!/usr/bin/env python3

#this script requires 3 inputs: kmer_matcher.py (target fasta) (query fasta) (kmer size)

import sys
import fasta

target = open(sys.argv[1])

reader = fasta.FASTAReader(target)

# Kept as a string here and converted with int(k) at each use below.
k = sys.argv[3]

target_dict = {}

# Same loop as the k-mer counting script, except that it builds
# target_dict[kmer] = [(gene, pos), (gene2, pos2), ...].
# `ident` and `sequence` are just the names given to the two values of each
# record the reader yields.
# NOTE(review): range(0, len(sequence) - int(k)) never visits the k-mer that
# starts at len(sequence) - int(k).
for ident, sequence in reader:
    for i in range(0, len(sequence) - int(k)):
        kmer = sequence[i:i + int(k)]
        if kmer not in target_dict:
            target_dict[kmer] = [(ident, i)]
        else:
            target_dict[kmer].append((ident, i))

# Next: generate k-mers from the query and look each one up in target_dict;
# for every hit, print its name and its position in the target file (the
# loop body is outside the visible source).
query = open(sys.argv[2])

reader2 = fasta.FASTAReader(query)

for ident2, sequence2 in reader2:
Ejemplo n.º 29
0
#!/usr/bin/env python3

import sys
import fasta
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import math

# argv[1] is read as the amino-acid FASTA, argv[2] as the DNA FASTA.
aa_reader = fasta.FASTAReader(open(sys.argv[1]))
dna_reader = fasta.FASTAReader(open(sys.argv[2]))

# Accumulators filled by code outside the visible source; presumably test
# statistics and significant / non-significant sites with their positions -
# TODO confirm against the rest of the script.
dic = {}
Z_test = []
diff = []

sig = []
sig_pos = []

non_sig = []
non_sig_pos = []

rel = 0


# Records are paired purely by position; zip stops at the shorter reader.
for (dna_ident, dna), (aa_ident, aa) in zip(dna_reader, aa_reader):
   j = 0
   gaps = []
   AA = []
   # Copy the amino-acid sequence into a list, one character per entry.
   for i in range(len(aa)):
       AA.append(aa[i])
Ejemplo n.º 30
0
import sys, fasta
 
target = open(sys.argv[1])
source = open(sys.argv[2])
lengths = []


k = int(sys.argv[3])
# Query index: upper-cased kmer -> list of start positions in the source file.
kmer_source = {}

# Feed the source (query) file to FASTAReader, which yields each record's
# name and sequence. For every start position i in range(0, len - k), take
# the k-character slice beginning at i; create an empty list the first time
# a k-mer is seen, then append the position.
# NOTE(review): range(0, len(sequence)-k) never visits the k-mer that starts
# at len(sequence) - k.
for ident, sequence in fasta.FASTAReader(source):
    sequence = sequence.upper()
    for i in range(0, len(sequence)-k):
        kmer = sequence[i : i + k]
        if kmer not in kmer_source:
            kmer_source[kmer] = []

        kmer_source[kmer].append(i)

# Build the target-side dictionary (the rest of this loop is outside the
# visible source).

kmer_position = {}
for ident, sequence in fasta.FASTAReader(target):
    sequence = sequence.upper()