def file_shuffler(infwd, inrev, outfile):
    """Interleave two sequence files record by record into ``outfile``.

    One record is copied from ``infwd``, then one from ``inrev``, and so on
    until ``infwd`` has no more lines.  A record is taken to be a fixed
    number of lines: 2 for fasta, 4 for fastq.  The format is detected from
    ``infwd`` only and the same record length is used for ``inrev``.

    No check is made that both inputs hold the same number of records.

    Args:
        infwd: name of the first (forward) input file.
        inrev: name of the second (reverse) input file.
        outfile: name of the interleaved file to write.

    Raises:
        SystemExit: if the format of ``infwd`` is reported as 'unknown'.
    """
    filetype = fasta_or_fastq(infwd)
    if filetype == 'unknown':
        sys.exit('File type not recognised: ' + infwd)

    lines_per_record = {'fasta': 2, 'fastq': 4}[filetype]

    # Track every handle as soon as it is opened so that all of them are
    # closed even if a later open, read or write fails.
    handles = []
    try:
        f_infwd = mh12_utils.open_file_read(infwd)
        handles.append(f_infwd)
        f_inrev = mh12_utils.open_file_read(inrev)
        handles.append(f_inrev)
        f_out = mh12_utils.open_file_write(outfile)
        handles.append(f_out)

        line_fwd = f_infwd.readline()
        line_rev = f_inrev.readline()

        while line_fwd:
            for _ in range(lines_per_record):
                f_out.write(line_fwd)
                line_fwd = f_infwd.readline()

            for _ in range(lines_per_record):
                f_out.write(line_rev)
                line_rev = f_inrev.readline()
    finally:
        for handle in handles:
            handle.close()
def write_file(fname, a):
    """Write each item of ``a`` to the file ``fname``, one item per line.

    Args:
        fname: name of the file to write.
        a: iterable of objects; each is converted with ``str()``.
    """
    out_handle = mh12_utils.open_file_write(fname)

    for record in a:
        # str() plus a newline is what "print >> handle, record" emits.
        out_handle.write(str(record) + '\n')

    out_handle.close()
def fastn2subset(infile, ids, outfile, complement=False, start_only=False):
    """Copy a chosen subset of the sequences in ``infile`` to ``outfile``.

    Args:
        infile: name of the fasta/fastq file to read.
        ids: container tested with ``in``; holds the ids to select.
        outfile: name of the file to write.
        complement: if true, write the sequences whose id is NOT in ``ids``
            instead of those whose id is.
        start_only: controls trimming of each id before it is tested (the
            trimmed id is also what ends up on the sequence that is
            written).  ``''`` keeps the part before the first whitespace;
            any other value not equal to ``False`` is handed to
            ``split()`` as the separator and the first piece is kept;
            ``False`` leaves the id untouched.

    Raises:
        SystemExit: if the format of ``infile`` is reported as 'unknown'.
    """
    file_format = fasta_or_fastq(infile)
    if file_format == 'unknown':
        sys.exit('File ' + infile + ' not recognised as a fasta/q')

    reader = mh12_utils.open_file_read(infile)
    writer = mh12_utils.open_file_write(outfile)

    record = get_next_seq_from_file(reader, file_format)
    while record:
        if start_only == '':
            record.id = record.id.split()[0]
        elif start_only != False:
            # Deliberately "!=" rather than "is not": values that compare
            # equal to False (such as 0) also mean "do not trim".
            record.id = record.id.split(start_only)[0]

        listed = record.id in ids
        wanted = (not listed) if complement else listed
        if wanted:
            writer.write(str(record) + '\n')

        record = get_next_seq_from_file(reader, file_format)

    reader.close()
    writer.close()
def fastn_splitter(fname, outprefix):
    """Write every sequence of ``fname`` to its own file.

    Each output name is ``outprefix`` + a name derived from the sequence
    id + '.' + the detected file type ('fasta' or 'fastq').

    Args:
        fname: name of the fasta/fastq file to split.
        outprefix: string prepended to every output file name.

    Returns:
        List of the output file names, in the order they were written.

    Raises:
        SystemExit: if the format of ``fname`` is reported as 'unknown'.
    """
    file_format = fasta_or_fastq(fname)
    if file_format == 'unknown':
        sys.exit('Unknown file format of ' + fname + ' in method fastn.fastn_splitter')

    written = []
    reader = mh12_utils.open_file_read(fname)

    record = get_next_seq_from_file(reader, file_format)
    while record:
        # Spaces become underscores, then the first character is dropped.
        # NOTE(review): presumably that character is a leading marker on
        # the id -- confirm against the sequence class.
        stem = record.id.replace(' ', '_')[1:]
        outname = outprefix + stem + '.' + file_format

        writer = mh12_utils.open_file_write(outname)
        writer.write(str(record) + '\n')
        writer.close()

        written.append(outname)
        record = get_next_seq_from_file(reader, file_format)

    reader.close()
    return written
def fasta2singleLine(infile, outfile):
    """Rewrite a fasta or fasta.qual file with each sequence on one line.

    Header lines (starting with '>') are copied unchanged; the lines that
    follow a header are stripped of trailing whitespace and concatenated.
    If the second line of the file starts with a digit the input is
    treated as a quality file, and a space is written after every
    concatenated piece.  Lines that start with a newline character
    (blank lines) are skipped.

    When ``infile`` and ``outfile`` are the same name, the result is first
    written to a randomly named temporary file next to it (with a '.gz'
    suffix if ``infile`` ends in '.gz') and then renamed over ``infile``.

    Args:
        infile: name of the file to read.
        outfile: name of the file to write; may equal ``infile``.

    Raises:
        SystemExit: if the temporary file name already exists.
    """
    overwrite = infile == outfile
    if overwrite:
        outfile = outfile + '-tmp-' + str(random.randint(0, 1000000))
        if infile.endswith('.gz'):
            outfile = outfile + '.gz'
        if os.path.exists(outfile):
            sys.exit('Yowzer! Unlikely to happen, but ' + outfile + ' already exists. Aborting')

    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    starts_with_digit = re.compile(r'^\d')

    # Peek at the second line to decide between fasta and fasta.qual.
    f_in = mh12_utils.open_file_read(infile)
    try:
        f_in.readline()
        fasta = not starts_with_digit.match(f_in.readline())
    finally:
        f_in.close()

    # Track handles as they are opened so all are closed on any failure.
    handles = []
    try:
        f_in = mh12_utils.open_file_read(infile)
        handles.append(f_in)
        f_out = mh12_utils.open_file_write(outfile)
        handles.append(f_out)

        first = True
        for line in f_in:
            if line.startswith("\n"):
                continue
            elif line.startswith(">"):
                # Terminate the previous record's sequence line before
                # starting a new header (nothing to terminate the first
                # time round).
                if not first:
                    f_out.write("\n")
                else:
                    first = False
                f_out.write(line)
            else:
                f_out.write(line.rstrip())
                if not fasta:
                    f_out.write(' ')

        # Terminate the final sequence line.
        f_out.write("\n")
    finally:
        for handle in handles:
            handle.close()

    # Only reached when writing succeeded, so a failed run never replaces
    # the input file.
    if overwrite:
        os.rename(outfile, infile)
def fasta2multiline(infile, outfile, line_length):
    """Rewrite the fasta file ``infile`` to ``outfile`` via ``multi_line_str``.

    Every sequence read from ``infile`` is written as whatever its
    ``multi_line_str(line_length)`` method returns, followed by a newline.

    Args:
        infile: name of the fasta file to read.
        outfile: name of the file to write; must differ from ``infile``.
        line_length: value passed through to ``multi_line_str``.

    Raises:
        SystemExit: if ``infile`` and ``outfile`` are the same name.
    """
    if infile == outfile:
        sys.exit('infile = outfile in fastn.fasta2multiline. Aborting')

    reader = mh12_utils.open_file_read(infile)
    writer = mh12_utils.open_file_write(outfile)

    record = get_next_seq_from_file(reader, 'fasta')
    while record:
        wrapped = record.multi_line_str(line_length)
        writer.write(str(wrapped) + '\n')
        record = get_next_seq_from_file(reader, 'fasta')

    reader.close()
    writer.close()
def fastn2uniq(infile, outfile):
    """Copy ``infile`` to ``outfile``, keeping one sequence per id.

    Sequences are written in input order; a sequence whose id has already
    been seen is dropped.  Only the id is compared, not the sequence
    itself.

    Args:
        infile: name of the fasta/fastq file to read.
        outfile: name of the file to write.

    Raises:
        SystemExit: if the format of ``infile`` is reported as 'unknown'
            (same guard and message as fastn2subset).
    """
    filetype = fasta_or_fastq(infile)
    if filetype == 'unknown':
        sys.exit('File ' + infile + ' not recognised as a fasta/q')

    seen_ids = set()

    # Track handles as they are opened so all are closed on any failure.
    handles = []
    try:
        f_in = mh12_utils.open_file_read(infile)
        handles.append(f_in)
        f_out = mh12_utils.open_file_write(outfile)
        handles.append(f_out)

        while True:
            seq = get_next_seq_from_file(f_in, filetype)
            if not seq:
                break

            if seq.id not in seen_ids:
                seen_ids.add(seq.id)
                f_out.write(str(seq) + '\n')
    finally:
        for handle in handles:
            handle.close()
parser.print_help() sys.exit(1) options.infile = parser.rargs[0] options.txtout = parser.rargs[1] options.plotout = parser.rargs[2] filetype = fastn.fasta_or_fastq(options.infile) if filetype == 'unknown': sys.exit('File ' + infile + ' not recognised as a fasta/q') gc_hist = dict(zip(range(101), [0] * 101)) f_in = mh12_utils.open_file_read(options.infile) f_out = mh12_utils.open_file_write(options.txtout) while 1: seq = fastn.get_next_seq_from_file(f_in, filetype) if not seq: break if options.window: i = 0 while i < len(seq): tmp = fastn.Fasta(seq.id, seq.seq[i:i + options.window]) gc = tmp.gc() gc_hist[floor(gc)] += 1 print >> f_out, seq.id, str(i + 1), gc