Exemple #1
0
#!/usr/bin/env python

import sys

from seqio import iteratorFromExtension
from nucio import fileIterator

# Exactly one argument is expected: the FASTA/FASTQ file to flatten.
# The usage text now reads "{fa,fq}" (was the typo "{fa.fq}").
if len(sys.argv) != 2:
    sys.exit("sequencToLine.py in.{fa,fq}\n")

# Pick the record parser from the file extension, then stream the records.
it = iteratorFromExtension(sys.argv[1])
for record in fileIterator(sys.argv[1], it):
    # Every record is written as one tab-separated line. Records that carry
    # a "desc" attribute get four columns (name, seq, desc, qual); all
    # others get name and seq only.
    fields = [record.name, record.seq]
    if hasattr(record, "desc"):
        fields.extend([record.desc, record.qual])
    print("\t".join(fields))

Exemple #2
0
    sys.exit("reference_segments.py in.fa")


# Completed run lengths, keyed by letter.
store_table = {"A": [], "C": [], "G": [], "T": [], "N": []}

# Slot indices inside the three-element accumulator list used by runs().
previous = 0  # letter of the run currently being counted
curr_count = 1  # running counter for the current run
table = 2  # dict: letter -> list of completed run lengths
# The counter starts at -1 so that the very first character seen does not
# record a spurious zero-length run for the "N" placeholder.
accumulator = ["N", -1, store_table]


def runs(acc, nxt):
    """Fold one character into a run-length accumulator.

    acc is a list of [current letter, counter, table] addressed through the
    module-level indices previous, curr_count and table.  When nxt differs
    from the current letter, the length of the run that just ended is
    appended to the table under that letter and a new run is started.

    The list is mutated in place and also returned, so the function can be
    used as a reduce() step.  The accumulator that was passed in is
    returned (previously the module-level one was returned regardless of
    the argument).
    """
    prev_letter = acc[previous]
    acc[curr_count] += 1
    if prev_letter != nxt:
        if acc[curr_count] > 0:
            # setdefault: letters missing from the table (e.g. lower-case
            # or ambiguity codes) get their own list instead of a KeyError.
            acc[table].setdefault(prev_letter, []).append(acc[curr_count])
        acc[previous] = nxt
        acc[curr_count] = 0

    return acc


for entry in fileIterator(sys.argv[1], fastaIterator):
    # runs() updates accumulator in place, so a plain loop replaces the
    # side-effect-only reduce() call.
    for base in entry.seq:
        runs(accumulator, base)
    runs(accumulator, "X")  # sentinel letter flushes the last run
    # Reset the run state so a run never continues across two entries.
    accumulator[previous] = "N"
    accumulator[curr_count] = -1

# Longest run of N seen in the input; 0 when there was none (max() of an
# empty list would raise ValueError).
print(max(store_table["N"] or [0]))
Exemple #3
0
# One CLArgument per raw specification in argument_list.
arguments = map(CLArgument._make, argument_list)

# Called without any argument: show the help text and stop.
if len(sys.argv) <= 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

p_arg_map, args_remaining = parseArgs(sys.argv[1:], arguments)

# At least one positional argument (the title) must be left over.
if len(args_remaining) < 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

title, infiles = args_remaining[0], args_remaining[1:]


def _cell_name(path):
    """Return the first two "_"-separated tokens of the text before the first "."."""
    stem = path.split(".")[0]
    return "_".join(stem.split("_")[:2])


cellnames = map(_cell_name, infiles)


def fit_gen(filename):
    """Return fileIterator(filename, lineItemIterator) for one input file."""
    return fileIterator(filename, lineItemIterator)


file_iterators = map(fit_gen, infiles)


def getBasesFromLineArr(arr):
    """Extract the value after "=" from the relevant item of a line.

    arr is a sequence of string items (presumably one split input line --
    confirm against lineItemIterator).

    * first item starts with "n=": value of item 6 is returned;
    * first item starts with "#>" followed by p_arg_map["lengreater"]:
      value of item 1 is returned;
    * empty/falsy arr or any other line: None.
    """
    if arr:
        first = arr[0]
        if first.startswith("n="):
            return arr[6].split("=")[1]
        threshold_tag = "#>%d" % p_arg_map["lengreater"]
        if first.startswith(threshold_tag):
            return arr[1].split("=")[1]
    return None


def getCountsFromLineArr(arr):
    if not bool(arr):
        return
Exemple #4
0
def getRawAlignments(fn):
    '''Return an iterator over the raw M4Records of a file.

       fn is the filename.  Each line is turned into a record by
       lineRecordIterator using M4Record and M4RecordTypes.'''
    def _m4_records(handle):
        # Per-file record parser handed to fileIterator.
        return lineRecordIterator(handle, M4Record, M4RecordTypes)

    return fileIterator(fn, _m4_records)
Exemple #5
0
#Downsample a library
import sys

from nucio import typeify, fileIterator
from seqio import iteratorFromExtension, recordToString, seqlen



# Four positional arguments are mandatory.
if len(sys.argv) != 5:
    sys.exit("Usage: downsample.py genome_size desired_cov input.{fa,fq} output.{fa,fq}\n")


# Converters applied, in order, to the four positional arguments.
types = [int, float, str, str]
sysins = sys.argv[1:1 + len(types)]
genome_size, target_cov, infn, outfn = typeify(sysins, types)

# Base budget: genome size multiplied by the requested coverage.
max_bases = genome_size * target_cov
total_bases = 0

with open(outfn, "w") as of:
    for record in fileIterator(infn, iteratorFromExtension(infn)):
        length = seqlen(record)
        # Records containing an "N" are skipped and never counted.
        if "N" in record.seq:
            continue
        # Stop as soon as the bases already written exceed the budget.
        if total_bases > max_bases:
            break
        of.write(recordToString(record) + "\n")
        total_bases = total_bases + length
    
#!/usr/bin/env python

import sys

from seqio import iteratorFromExtension, recordToString
from nucio import fileIterator 
from misc import reverse_complement

# Exactly one argument: the FASTA/FASTQ file to reverse-complement.
if len(sys.argv) != 2:
    sys.exit("reverseComplement.py in.{fa,fq}")

f = sys.argv[1]

for record in fileIterator(f, iteratorFromExtension(f)):
    changes = {"seq": reverse_complement(record.seq)}
    # Per-base qualities have to stay aligned with their bases, so when the
    # record carries a non-empty "qual" it is reversed together with the
    # sequence (previously it was emitted in the original orientation).
    qual = getattr(record, "qual", None)
    if qual:
        changes["qual"] = qual[::-1]
    print(recordToString(record._replace(**changes)))
Exemple #7
0
# Column accessors for one parsed line (a sequence of items).
ref = itemgetter(7)


def pos(r):
    """Return item 8 of r converted to int."""
    return int(r[8])


strand = itemgetter(9)

# Two input files are required.
if len(sys.argv) != 3:
    sys.exit("filterpairs.py read1.novo read2.novo\n")


def names_eq(name1, name2):
    """True when both names agree on the part before the first "/"."""
    return name1.split("/")[0] == name2.split("/")[0]


filenames = sys.argv[1:3]


def headfilt(x):
    """Line filter: reject header lines, i.e. those starting with "#"."""
    return not x.startswith("#")


filt_lii = partial(lineItemIterator, filter_func=headfilt)


def _open_filtered(fn):
    """Open fn with the header-skipping line item iterator."""
    return fileIterator(fn, filt_lii)


filt_fits = map(_open_filtered, filenames)

# Tallies and collections; apart from total they are updated by code that
# is not part of this excerpt.
failrepeat = failmapq = failsameref = total = passed = 0
insertNotRF = []
insertRF = []


# Walk both files in lockstep, one record from each per iteration.
for read1, read2 in izip(*filt_fits):
    total = total + 1
    # Abort on the first pair whose names disagree.
    if not names_eq(name(read1), name(read2)):
        sys.exit("Error: %s not equal to %s\n" % (name(read1), name(read2)))
    
Exemple #8
0
                 ["miny", "miny", int, 0, "minimum y axis value"],
                 ["maxy", "maxy", int, -1, "maximmum y axis value"],
                 [
                     "out", "out", str, "out.pdf",
                     "output file name (default out.pdf)"
                 ], ["title", "title", str, "", "Title for graph"]]

# One CLArgument per raw specification in argument_list.
arguments = map(CLArgument._make, argument_list)

p_args, args_remaining = parseArgs(sys.argv[1:], arguments)

# Exactly one positional argument (the input file) is expected.
if len(args_remaining) != 1:
    sys.exit(getHelpStr(description, arguments) + "\n")


def conv(pair):
    """Turn a two-item sequence into an (int, int) tuple."""
    (i, j) = pair
    return (int(i), int(j))


# Transpose the per-line pairs into one tuple of x values and one of y values.
(x, y) = zip(*imap(conv, fileIterator(args_remaining[0], lineItemIterator)))

pp = PdfPages(p_args["out"])

plt.plot(x, y)
(minx, maxx) = plt.xlim()
(miny, maxy) = plt.ylim()

# A lower bound from the command line only applies when it lies above the
# limit matplotlib picked; an upper bound only when it is positive.
if p_args["minx"] > minx:
    minx = p_args["minx"]
if p_args["maxx"] > 0:
    maxx = p_args["maxx"]
if p_args["miny"] > miny:
    miny = p_args["miny"]
if p_args["maxy"] > 0:
    maxy = p_args["maxy"]
plt.xlim((minx, maxx))
plt.ylim((miny, maxy))

plt.xlabel("Kmer Coverage")
Exemple #9
0
#!/usr/bin/env python

import sys

from itertools import imap

from seqio import iteratorFromExtension
from nucio import fileIterator


##Create Kmers

# Two arguments are required: the k-mer size and the input file.
if len(sys.argv) != 3:
    sys.exit("Usage: kmer.py k-size in.fa\n")

fn = sys.argv[2]
ksize = int(sys.argv[1])

# A k-mer needs at least one base; zero or negative sizes would only emit
# empty or meaningless slices.
if ksize < 1:
    sys.exit("Error: k-size must be a positive integer\n")

for record in fileIterator(fn, iteratorFromExtension(fn)):
    seq = record.seq
    # Step through the start offsets with a plain counter.  Under Python 2
    # range() would first build a list with one entry per position, which
    # is wasteful for long sequences.
    last_start = len(seq) - ksize
    start = 0
    while start <= last_start:
        print(seq[start:start + ksize])
        start += 1
Exemple #10
0
argument_list = [["span", "span", argflag, False,
                  "Only alignments that span the region"]]

arguments = map(CLArgument._make, argument_list)

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

# Two positional arguments: the m4 file and the region "chr:start-end".
if len(args_remaining) != 2:
    sys.exit(getHelpStr(description, arguments) + "\n")


inm4 = args_remaining[0]
# Split on the LAST colon so that sequence names which themselves contain
# ":" are still parsed correctly.
(chrom, rest) = args_remaining[1].rsplit(":", 1)
(start, end) = map(int, rest.split("-"))


def it(fh):
    """Parse an open m4 file into M4Record objects."""
    return lineRecordIterator(fh, M4Record, M4RecordTypes)


if p_arg_map["span"]:
    def cond(r):
        """Keep alignments on chrom that start before and end after the region."""
        return r.tname == chrom and r.tstart < start and r.tend > end
else:
    def cond(r):
        """Keep alignments on chrom that neither end before nor start after the region."""
        return r.tname == chrom and not r.tend < start and not r.tstart > end

filt_records = ifilter(cond, fileIterator(inm4, it))

for r in imap(recordToString, filt_records):
    print(r)





Exemple #11
0
    ["minx","minx", int, 0,"minimum x axis value"],
    ["maxx", "maxx", int, -1,"maximum x axis value"],
    ["miny","miny", int, 0,"minimum y axis value"],
    ["maxy","maxy", int, -1,"maximmum y axis value"],
    ["out","out", str, "out.pdf", "output file name (default out.pdf)"],
    ["title","title", str, "", "Title for graph"]]

# One CLArgument per raw specification in argument_list.
arguments = map(CLArgument._make, argument_list)

p_args, args_remaining = parseArgs(sys.argv[1:], arguments)

# Exactly one positional argument (the input file) is expected.
if len(args_remaining) != 1:
    sys.exit(getHelpStr(description, arguments) + "\n")


def conv(pair):
    """Turn a two-item sequence into an (int, int) tuple."""
    (i, j) = pair
    return (int(i), int(j))


# Transpose the per-line pairs into one tuple of x values and one of y values.
(x, y) = zip(*imap(conv, fileIterator(args_remaining[0], lineItemIterator)))

pp = PdfPages(p_args["out"])

plt.plot(x, y)
(minx, maxx) = plt.xlim()
(miny, maxy) = plt.ylim()

# A lower bound from the command line only applies when it lies above the
# limit matplotlib picked; an upper bound only when it is positive.
if p_args["minx"] > minx:
    minx = p_args["minx"]
if p_args["maxx"] > 0:
    maxx = p_args["maxx"]
if p_args["miny"] > miny:
    miny = p_args["miny"]
if p_args["maxy"] > 0:
    maxy = p_args["maxy"]
plt.xlim((minx, maxx))
plt.ylim((miny, maxy))

plt.xlabel("Kmer Coverage")
Exemple #12
0
# Column accessors for one parsed line (a sequence of items).
def aqual(r):
    """Return item 6 of r converted to int."""
    return int(r[6])


ref = itemgetter(7)


def pos(r):
    """Return item 8 of r converted to int."""
    return int(r[8])


strand = itemgetter(9)

# Two input files are required.
if len(sys.argv) != 3:
    sys.exit("filterpairs.py read1.novo read2.novo\n")


def names_eq(name1, name2):
    """True when both names agree on the part before the first "/"."""
    return name1.split("/")[0] == name2.split("/")[0]


filenames = sys.argv[1:3]


def headfilt(x):
    """Line filter: reject header lines, i.e. those starting with "#"."""
    return not x.startswith("#")


filt_lii = partial(lineItemIterator, filter_func=headfilt)


def _open_filtered(fn):
    """Open fn with the header-skipping line item iterator."""
    return fileIterator(fn, filt_lii)


filt_fits = map(_open_filtered, filenames)

# Tallies and collections; apart from total they are updated by code that
# is not part of this excerpt.
failrepeat = failmapq = failsameref = total = passed = 0
insertNotRF = []
insertRF = []

# Walk both files in lockstep, one record from each per iteration.
for read1, read2 in izip(*filt_fits):
    total = total + 1
    # Abort on the first pair whose names disagree.
    if not names_eq(name(read1), name(read2)):
        sys.exit("Error: %s not equal to %s\n" % (name(read1), name(read2)))

    if not status(read1) == "U" or not status(read2) == "U":
# The script takes exactly one argument, the FASTA file.
if len(sys.argv) != 2:
    sys.exit("reference_segments.py in.fa")

# Completed run lengths, keyed by letter.
store_table = {'A': [], 'C': [], 'G': [], 'T': [], 'N': []}

# Slot indices inside the three-element accumulator list used by runs().
previous = 0  # letter of the run currently being counted
curr_count = 1  # running counter for the current run
table = 2  # dict: letter -> list of completed run lengths
# The counter starts at -1 so that the very first character seen does not
# record a spurious zero-length run for the "N" placeholder.
accumulator = ["N", -1, store_table]


def runs(acc, nxt):
    """Fold one character into a run-length accumulator.

    acc is a list of [current letter, counter, table] addressed through the
    module-level indices previous, curr_count and table.  When nxt differs
    from the current letter, the length of the run that just ended is
    appended to the table under that letter and a new run is started.

    The list is mutated in place and also returned, so the function can be
    used as a reduce() step.  The accumulator that was passed in is
    returned (previously the module-level one was returned regardless of
    the argument).
    """
    prev_letter = acc[previous]
    acc[curr_count] += 1
    if prev_letter != nxt:
        if acc[curr_count] > 0:
            # setdefault: letters missing from the table (e.g. lower-case
            # or ambiguity codes) get their own list instead of a KeyError.
            acc[table].setdefault(prev_letter, []).append(acc[curr_count])
        acc[previous] = nxt
        acc[curr_count] = 0

    return acc


for entry in fileIterator(sys.argv[1], fastaIterator):
    # runs() updates accumulator in place, so a plain loop replaces the
    # side-effect-only reduce() call.
    for base in entry.seq:
        runs(accumulator, base)
    runs(accumulator, 'X')  # sentinel letter flushes the last run
    # Reset the run state so a run never continues across two entries.
    accumulator[previous] = 'N'
    accumulator[curr_count] = -1

# Longest run of N seen in the input; 0 when there was none (max() of an
# empty list would raise ValueError).
print(max(store_table['N'] or [0]))
Exemple #14
0
def getRawAlignments(fn):
    '''Return an iterator over the raw M4Records of a file.

       fn is the filename.  Each line is turned into a record by
       lineRecordIterator using M4Record and M4RecordTypes.'''
    def _m4_records(handle):
        # Per-file record parser handed to fileIterator.
        return lineRecordIterator(handle, M4Record, M4RecordTypes)

    return fileIterator(fn, _m4_records)
Exemple #15
0
from args import parseArgs, getHelpStr, argflag, CLArgument

description = ("Usage: m4region.py [options] input.m4 chr:start-end\n"
               "Returns alignments that touch a region\n")

argument_list = [[
    "span", "span", argflag, False, "Only alignments that span the region"
]]

arguments = map(CLArgument._make, argument_list)

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

# Two positional arguments: the m4 file and the region "chr:start-end".
if len(args_remaining) != 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

inm4 = args_remaining[0]
# Split on the LAST colon so that sequence names which themselves contain
# ":" are still parsed correctly.
(chrom, rest) = args_remaining[1].rsplit(":", 1)
(start, end) = map(int, rest.split("-"))


def it(fh):
    """Parse an open m4 file into M4Record objects."""
    return lineRecordIterator(fh, M4Record, M4RecordTypes)


if p_arg_map["span"]:
    def cond(r):
        """Keep alignments on chrom that start before and end after the region."""
        return r.tname == chrom and r.tstart < start and r.tend > end
else:
    def cond(r):
        """Keep alignments on chrom that neither end before nor start after the region."""
        return r.tname == chrom and not r.tend < start and not r.tstart > end

filt_records = ifilter(cond, fileIterator(inm4, it))

for r in imap(recordToString, filt_records):
    print(r)
Exemple #16
0
# Called without any argument: show the help text and stop.
if len(sys.argv) <= 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

p_arg_map, args_remaining = parseArgs(sys.argv[1:], arguments)


# At least one positional argument (the title) must be left over.
if len(args_remaining) < 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

title, infiles = args_remaining[0], args_remaining[1:]


def _cell_name(path):
    """Return the first two "_"-separated tokens of the text before the first "."."""
    stem = path.split(".")[0]
    return "_".join(stem.split("_")[:2])


cellnames = map(_cell_name, infiles)


def fit_gen(filename):
    """Return fileIterator(filename, lineItemIterator) for one input file."""
    return fileIterator(filename, lineItemIterator)


file_iterators = map(fit_gen, infiles)

def getBasesFromLineArr(arr):
    """Extract the value after "=" from the relevant item of a line.

    arr is a sequence of string items (presumably one split input line --
    confirm against lineItemIterator).

    * first item starts with "n=": value of item 6 is returned;
    * first item starts with "#>" followed by p_arg_map["lengreater"]:
      value of item 1 is returned;
    * empty/falsy arr or any other line: None.
    """
    if arr:
        first = arr[0]
        if first.startswith("n="):
            return arr[6].split("=")[1]
        threshold_tag = "#>%d" % p_arg_map["lengreater"]
        if first.startswith(threshold_tag):
            return arr[1].split("=")[1]
    return None

def getCountsFromLineArr(arr):
    if not bool(arr):
        return
    if arr[0].startswith("n="):
        return arr[0].split("=")[1]