Example #1
0
# Command-line driver: print every record from the given sequence files whose
# length is greater than `minlen` and, when a positive --maxlen is supplied,
# not greater than `maxlen`.
#
# Positional arguments (after option parsing): minlen, then one or more files.
argument_list = [["maxlen", "maxlen", int, -1, "Maximum length of reads"]]

# Built as a real list because the specs are consumed twice below
# (once for the help text, once for parsing).
arguments = [CLArgument._make(spec) for spec in argument_list]

if len(sys.argv) <= 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

# The raw argv check above cannot tell options from positionals; if option
# parsing consumed everything there is no minlen to read, so show the help
# text instead of failing with an IndexError.
if len(args_remaining) < 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

minlen = int(args_remaining[0])

files = args_remaining[1:]

if not all(os.path.exists(path) for path in files):
    sys.exit("Not all files exist")

# Resolve the parser for every file eagerly so that an unsupported file is
# reported before any output is produced.
reader_args = [(path, iteratorFromExtension(path)) for path in files]
file_readers = starmap(fileIterator, reader_args)

maxlen = p_arg_map["maxlen"]

if maxlen > 0:
    # Any positive cap is honoured (the previous "> 1" test silently ignored
    # a cap of exactly 1); non-positive values, including the default -1,
    # mean "no upper bound".
    def filt_cond(record):
        """Keep records with minlen < length <= maxlen."""
        return minlen < seqlen(record) <= maxlen
else:
    def filt_cond(record):
        """Keep records strictly longer than minlen."""
        return seqlen(record) > minlen

filtered_records = ifilter(filt_cond, chain.from_iterable(file_readers))

filtered_seqs = imap(recordToString, filtered_records)

for seq in filtered_seqs:
    # Single-argument parenthesised form prints identically under Python 2.
    print(seq)
Example #2
0
# Command-line driver: stream records from the given sequence files and print
# those longer than `minlen` (and, if a positive maxlen option was given, no
# longer than that cap).
#
# Positional arguments (after option parsing): minlen, then the input files.

# A concrete list is needed: the specs are used for both help and parsing.
arguments = [CLArgument._make(spec) for spec in argument_list]

if len(sys.argv) <= 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

# Guard against option parsing having consumed every argument: without a
# positional minlen the indexing below would raise IndexError, so print the
# help text and exit instead.
if len(args_remaining) < 1:
    sys.exit(getHelpStr(description, arguments) + "\n")

minlen = int(args_remaining[0])

files = args_remaining[1:]

missing = [path for path in files if not os.path.exists(path)]
if missing:
    sys.exit("Not all files exist")

# Parsers are looked up eagerly so a bad file is reported before any output.
parsers = [iteratorFromExtension(path) for path in files]
file_readers = starmap(fileIterator, zip(files, parsers))

upper_bound = p_arg_map["maxlen"]


def filt_cond(record):
    """Return True when the record's length is within the requested bounds.

    The lower bound is exclusive (length > minlen). The upper bound is
    inclusive and only applied when a positive maxlen was supplied; the
    default of -1 (or any non-positive value) disables it. Previously the
    cap was only applied for maxlen > 1, so a cap of exactly 1 was ignored.
    """
    length = seqlen(record)
    if length <= minlen:
        return False
    return upper_bound <= 0 or length <= upper_bound


filtered_records = ifilter(filt_cond, chain.from_iterable(file_readers))

filtered_seqs = imap(recordToString, filtered_records)

for seq in filtered_seqs:
    # Single-argument parenthesised form prints identically under Python 2.
    print(seq)


Example #3
0
#Downsample a library
import sys

from nucio import typeify, fileIterator
from seqio import iteratorFromExtension, recordToString, seqlen



# Expect exactly four arguments after the script name.
if len(sys.argv) != 5:
    sys.exit("Usage: downsample.py genome_size desired_cov input.{fa,fq} output.{fa,fq}\n")

# Convert the raw argument strings to their expected types, position by
# position: genome size, target coverage, input path, output path.
types = [int, float, str, str]
sysins = sys.argv[1:1 + len(types)]
genome_size, target_cov, infn, outfn = typeify(sysins, types)

# Base budget: records are copied until the running total exceeds this.
max_bases = genome_size * target_cov
total_bases = 0

with open(outfn, "w") as of:
    records = fileIterator(infn, iteratorFromExtension(infn))
    for record in records:
        length = seqlen(record)
        # Records whose sequence contains an "N" are skipped and do not
        # count towards the base budget.
        if "N" not in record.seq:
            # The budget is checked before writing, so the record that
            # crosses the threshold is still written; the next one stops
            # the loop.
            if total_bases > max_bases:
                break
            of.write(recordToString(record))
            of.write("\n")
            total_bases = total_bases + length
    
Example #4
0
# Build one record parser and one "open" function per input, both derived
# from `openers`: item 1 of each entry is passed to iteratorFromExtension,
# item 0 is passed through defdef(open).
# NOTE(review): defdef is defined outside this excerpt; presumably it
# substitutes the builtin open when item 0 is empty -- confirm.
iterators = map(iteratorFromExtension, imap(itemgetter(1), openers))
openfuncs = map(defdef(open), imap(itemgetter(0),openers))

# Chain the per-file iterators into a single stream of records, in the order
# of in_files.
input_data = chain.from_iterable(starmap(fileIterator,
                                         zip(in_files, iterators, openfuncs)))
total_reads = 0   # read counter used to decide when to roll over to a new file
dnum = 0          # index of the current output directory
fnum = 0          # index of the current output file within that directory
fh = None         # handle of the output file currently being written, if any
# Opened for writing here; not used in the visible part of the script.
readidx_fh = open("ReadIndex.txt", "w")

# Choose the record formatter: recordToString when the "samefmt" option is
# truthy, fastaRecordToString otherwise.
recordString = recordToString if p_arg_map["samefmt"] else fastaRecordToString

for record in input_data:

    # Skip records shorter than the configured minimum length.
    if seqlen(record) < p_arg_map["minlen"]:
        continue

    # Every `rpf` counted reads a new output file is started; every
    # `rpf * fpd` counted reads a new directory is created first and the
    # file index restarts.
    # NOTE(review): total_reads is not incremented in the visible lines;
    # presumably that happens further down the loop body -- confirm.
    if total_reads % rpf == 0:
        if total_reads % (rpf * fpd) == 0:
            dnum += 1
            fnum = 0
            os.mkdir(pstr(dnum))
        fnum += 1
        # Close the previous output file before opening the next one.
        if fh:
            fh.close()
        # Output path is "<pstr(dnum)>/p<pstr(fnum)>".
        current_file ="%s/p%s" % (pstr(dnum),pstr(fnum))
        fh = open(current_file, "w") 

    # Keep only the first whitespace-delimited token of the record name and
    # build a copy of the record carrying that shortened name.
    clean_name = str(record.name).split()[0]
    clean_record = record._replace(name=clean_name)