Exemple #1
0
#!/usr/bin/env python

import sys

from seqio import iteratorFromExtension
from nucio import fileIterator

# Print every record of the input file as a single tab-separated line.
#
# Usage: exactly one argument, the path of the sequence file to read. The
# record parser is chosen by iteratorFromExtension() from that path.
if len(sys.argv) != 2:
    # NOTE(review): the script name in this message looks misspelled
    # ("sequenc") -- confirm against the real filename before changing it.
    sys.exit("sequencToLine.py in.{fa,fq}\n")

in_path = sys.argv[1]
for record in fileIterator(in_path, iteratorFromExtension(in_path)):
    fields = [record.name, record.seq]
    # Records exposing a "desc" attribute are printed with their description
    # and quality columns as well; all others get name and sequence only.
    if hasattr(record, "desc"):
        fields.extend([record.desc, record.qual])
    # Single-argument call form parses identically on Python 2 and Python 3.
    print("\t".join(fields))

#!/usr/bin/env python

import sys

from seqio import iteratorFromExtension, recordToString
from nucio import fileIterator 
from misc import reverse_complement

# Print every record of the input file with its sequence replaced by the
# result of reverse_complement(); all other record fields are left untouched.
if len(sys.argv) != 2:
    sys.exit("reverseComplement.py in.{fa,fq}")

in_path = sys.argv[1]
record_parser = iteratorFromExtension(in_path)

for rec in fileIterator(in_path, record_parser):
    # _replace returns a copy of the record with only the seq field swapped.
    flipped = rec._replace(seq=reverse_complement(rec.seq))
    print(recordToString(flipped))
Exemple #3
0
#Downsample a library
import sys

from nucio import typeify, fileIterator
from seqio import iteratorFromExtension, recordToString, seqlen



# Copy records from the input file to the output file until the number of
# bases written reaches genome_size * desired_cov. Records whose sequence
# contains an "N" are never written.
if len(sys.argv) != 5:
    sys.exit("Usage: downsample.py genome_size desired_cov input.{fa,fq} output.{fa,fq}\n")

# Command-line arguments are handed to typeify() together with the type wanted
# for each position: genome size (int), coverage (float), and the two paths.
types = [int, float, str, str]
sysins = sys.argv[1:len(types)+1]
(genome_size, target_cov, infn, outfn) = typeify(sysins, types)

max_bases = genome_size * target_cov
total_bases = 0

with open(outfn, "w") as of:
    for record in fileIterator(infn, iteratorFromExtension(infn)):
        # Stop as soon as the target is met. ">=" (not ">") so that landing
        # exactly on the target does not emit one extra record, and the test
        # comes first so we do not keep scanning past N-containing records
        # once nothing more will be written.
        if total_bases >= max_bases:
            break
        if "N" in record.seq:
            continue
        of.write(recordToString(record))
        of.write("\n")
        # Only records that were actually written count towards the total.
        total_bases += seqlen(record)
    
Exemple #4
0
#!/usr/bin/env python

import sys

from itertools import imap

from seqio import iteratorFromExtension
from nucio import fileIterator


##Create Kmers

# Print every k-mer (each length-k substring, in order of start position) of
# every sequence in the input file, one k-mer per line.
if len(sys.argv) != 3:
    sys.exit("Usage: kmer.py k-size in.fa\n")

fn = sys.argv[2]

# Reject a k-size that is not a positive integer up front: zero would print
# one empty line per position and a negative value would print slices of the
# wrong length.
try:
    ksize = int(sys.argv[1])
except ValueError:
    sys.exit("kmer.py: k-size must be an integer, got %r\n" % sys.argv[1])
if ksize < 1:
    sys.exit("kmer.py: k-size must be at least 1, got %d\n" % ksize)

for record in fileIterator(fn, iteratorFromExtension(fn)):
    seq = record.seq
    # Last index at which a full-length k-mer still fits; negative when the
    # sequence is shorter than k, in which case nothing is printed.
    last_start = len(seq) - ksize
    start = 0
    # Plain sliding window: no index list is built up front, so memory use
    # does not grow with sequence length.
    while start <= last_start:
        print(seq[start:start + ksize])
        start += 1