def file_shuffler(infwd, inrev, outfile):
    """Interleave a forward and a reverse read file into one output file.

    Records are copied alternately: one record from ``infwd`` followed by one
    record from ``inrev``.  A record is 2 lines for FASTA and 4 lines for
    FASTQ.  The format is detected from ``infwd`` only and the same record
    size is used for ``inrev``.  Copying stops when ``infwd`` is exhausted;
    no check is made that both files hold the same number of records.

    Args:
        infwd: Path of the forward reads file; also used to detect the format.
        inrev: Path of the reverse reads file.
        outfile: Path of the interleaved file to write.

    Exits via ``sys.exit`` if the format of ``infwd`` is not recognised.
    """
    filetype = fasta_or_fastq(infwd)
    if filetype == 'unknown':
        sys.exit('File type not recognised: ' + infwd)

    # Number of lines that make up one record in each supported format.
    lineskip = {'fasta': 2, 'fastq': 4}[filetype]

    # Track every handle that was opened successfully so that all of them are
    # closed even when a later open or a read/write fails.
    handles = []
    try:
        f_infwd = mh12_utils.open_file_read(infwd)
        handles.append(f_infwd)
        f_inrev = mh12_utils.open_file_read(inrev)
        handles.append(f_inrev)
        f_out = mh12_utils.open_file_write(outfile)
        handles.append(f_out)

        line_fwd = f_infwd.readline()
        line_rev = f_inrev.readline()

        while line_fwd:
            # One full record from the forward file ...
            for _ in range(lineskip):
                f_out.write(line_fwd)
                line_fwd = f_infwd.readline()

            # ... followed by one full record from the reverse file.
            for _ in range(lineskip):
                f_out.write(line_rev)
                line_rev = f_inrev.readline()
    finally:
        for handle in handles:
            handle.close()
def fasta2singleLine(infile, outfile):
    """Rewrite a FASTA (or FASTA-style quality) file with one line per sequence.

    Header lines (starting with ``>``) are copied unchanged.  All lines between
    two headers are stripped of trailing whitespace and joined onto a single
    line.  Lines that start with a newline character (blank lines) are dropped.
    If the second line of the input starts with a digit the file is treated as
    a quality file and a space is written after each joined chunk so that the
    numbers stay separated.

    Args:
        infile: Path of the input file.
        outfile: Path of the output file.  If equal to ``infile`` the result
            is written to a randomly named temporary file which is then
            renamed over ``infile``.

    Exits via ``sys.exit`` if the temporary file name already exists.
    """
    overwrite = infile == outfile

    if overwrite:
        outfile = outfile + '-tmp-' + str(random.randint(0, 1000000))
        # Keep the '.gz' suffix on the temporary name; presumably the write
        # helper picks compression from the file name -- TODO confirm.
        if infile.endswith('.gz'):
            outfile = outfile + '.gz'

        if os.path.exists(outfile):
            sys.exit('Yowzer! Unlikely to happen, but ' + outfile + ' already exists. Aborting')

    starts_with_digit = re.compile(r'^\d')

    # Determine if it's fasta or a fasta.qual file: skip the first line and
    # look at whether the second one starts with a digit.
    f_in = mh12_utils.open_file_read(infile)
    try:
        f_in.readline()
        fasta = not starts_with_digit.match(f_in.readline())
    finally:
        f_in.close()

    f_in = mh12_utils.open_file_read(infile)
    try:
        f_out = mh12_utils.open_file_write(outfile)
        try:
            first = True

            for line in f_in:
                if line.startswith("\n"):
                    continue
                elif line.startswith(">"):
                    # Terminate the previous record's joined sequence line.
                    if not first:
                        f_out.write("\n")
                    else:
                        first = False

                    f_out.write(line)
                else:
                    f_out.write(line.rstrip())
                    if not fasta:
                        f_out.write(' ')

            # Terminate the last record's sequence line.
            f_out.write("\n")
        finally:
            f_out.close()
    finally:
        f_in.close()

    # Only replace the input once the output has been written and closed.
    if overwrite:
        os.rename(outfile, infile)
def fastn2subset(infile, ids, outfile, complement=False, start_only=False):
    """Copy a subset of the sequences in a FASTA/FASTQ file to a new file.

    Args:
        infile: Path of the input FASTA/FASTQ file.
        ids: Container of sequence ids, queried with ``in``.
        outfile: Path of the file to write the selected sequences to.
        complement: If true, write the sequences whose id is NOT in ``ids``
            instead of those whose id is in ``ids``.
        start_only: Controls how each id is shortened before the lookup (the
            shortened id is also what gets written).  ``''`` keeps the text
            before the first whitespace; any other value not equal to
            ``False`` is used as the separator and the text before its first
            occurrence is kept; ``False`` leaves the id untouched.

    Exits via ``sys.exit`` if the format of ``infile`` is not recognised.
    """
    filetype = fasta_or_fastq(infile)
    if filetype == 'unknown':
        sys.exit('File ' + infile + ' not recognised as a fasta/q')

    reader = mh12_utils.open_file_read(infile)
    writer = mh12_utils.open_file_write(outfile)
    want_listed = not complement

    while True:
        seq = get_next_seq_from_file(reader, filetype)
        if not seq:
            break

        if start_only == '':
            seq.id = seq.id.split()[0]
        elif start_only != False:  # noqa: E712 -- equality test is intentional
            seq.id = seq.id.split(start_only)[0]

        # Keep the record when "is listed" agrees with "listed ones wanted".
        is_listed = seq.id in ids
        if is_listed == want_listed:
            writer.write(str(seq) + '\n')

    reader.close()
    writer.close()
def fastn_splitter(fname, outprefix):
    """Write every sequence of a FASTA/FASTQ file to a file of its own.

    Each output name is ``outprefix`` + the sequence id with spaces replaced
    by underscores and its first character dropped + ``'.'`` + the detected
    file type (``fasta`` or ``fastq``).

    Args:
        fname: Path of the input FASTA/FASTQ file.
        outprefix: String prepended to every output file name.

    Returns:
        List of the output file names, in the order they were written.

    Exits via ``sys.exit`` if the format of ``fname`` is not recognised.
    """
    filetype = fasta_or_fastq(fname)
    if filetype == 'unknown':
        sys.exit('Unknown file format of ' + fname + ' in method fastn.fastn_splitter')

    written = []
    reader = mh12_utils.open_file_read(fname)

    # One output file per sequence read from the input.
    while True:
        seq = get_next_seq_from_file(reader, filetype)
        if not seq:
            break

        stem = seq.id.replace(' ', '_')[1:]
        outname = outprefix + stem + '.' + filetype

        writer = mh12_utils.open_file_write(outname)
        writer.write(str(seq) + '\n')
        writer.close()

        written.append(outname)

    reader.close()
    return written
def fastn2lengthdic(fname, d, min_length=1, ignoreN=False, first_only=False):
    """Fill ``d`` with sequence id -> sequence length for a FASTA/FASTQ file.

    Args:
        fname: Path of the input FASTA/FASTQ file.
        d: Dictionary updated in place.
        min_length: Sequences shorter than this are skipped.  The test is made
            on the length before any N removal.
        ignoreN: If true, ``N`` and ``n`` characters are removed from the
            sequence before its length is recorded.
        first_only: If true, only the part of the id before the first
            whitespace is used as the key.

    Exits via ``sys.exit`` if the format of ``fname`` is not recognised.
    """
    filetype = fasta_or_fastq(fname)
    if filetype == "unknown":
        sys.exit("Unknown file format of " + fname + " in method mh12_utils.fastn2lengthdic")

    reader = mh12_utils.open_file_read(fname)

    while True:
        seq = get_next_seq_from_file(reader, filetype)
        if not seq:
            break

        if len(seq) >= min_length:
            if ignoreN:
                seq.seq = seq.seq.replace('N', '').replace('n', '')

            if first_only:
                seq.id = seq.id.split()[0]

            d[seq.id] = len(seq)

    reader.close()
def fai2length_hash(filename, d):
    """Fill ``d`` with name -> length pairs read from a whitespace-separated file.

    The first column of every line is used as the key and the second column,
    converted to ``int``, as the value.  Blank lines are skipped.  Judging by
    the function name the input is a FASTA index (``.fai``) file.

    Args:
        filename: Path of the file to read.
        d: Dictionary updated in place.

    Raises:
        IndexError: If a non-blank line has fewer than two columns.
        ValueError: If the second column is not an integer.
    """
    f = mh12_utils.open_file_read(filename)
    try:
        for line in f:
            fields = line.split()
            if not fields:
                # A blank (or whitespace-only) line carries no record.
                continue
            d[fields[0]] = int(fields[1])
    finally:
        f.close()
def fasta_or_fastq(filename):
    """Guess the format of a sequence file from the start of its first line.

    Args:
        filename: Path of the file to inspect.

    Returns:
        ``'fasta'`` if the first line starts with ``>``, ``'fastq'`` if it
        starts with ``@``, otherwise ``'unknown'`` (this includes a file whose
        first line is empty).
    """
    handle = mh12_utils.open_file_read(filename)
    first_line = handle.readline()
    handle.close()

    for marker, kind in (('>', 'fasta'), ('@', 'fastq')):
        if first_line.startswith(marker):
            return kind

    return 'unknown'
def fastn2dictionary(fname, d, first_only=False):
    """Fill ``d`` with sequence id -> sequence object for a FASTA/FASTQ file.

    Args:
        fname: Path of the input FASTA/FASTQ file.
        d: Dictionary updated in place; a later sequence with the same id
            replaces an earlier one.
        first_only: If true, each id is cut down to the part before the first
            whitespace (on the stored object as well as in the key).
    """
    filetype = fasta_or_fastq(fname)
    reader = mh12_utils.open_file_read(fname)

    while True:
        seq = get_next_seq_from_file(reader, filetype)
        if not seq:
            break

        if first_only:
            seq.id = seq.id.split()[0]

        d[seq.id] = seq

    reader.close()
def fasta2multiline(infile, outfile, line_length):
    """Rewrite a FASTA file using each sequence's ``multi_line_str`` rendering.

    Args:
        infile: Path of the input FASTA file.
        outfile: Path of the output file; must differ from ``infile``.
        line_length: Value passed to ``multi_line_str`` for every sequence.

    Exits via ``sys.exit`` if ``infile`` and ``outfile`` are the same.
    """
    if infile == outfile:
        sys.exit('infile = outfile in fastn.fasta2multiline. Aborting')

    reader = mh12_utils.open_file_read(infile)
    writer = mh12_utils.open_file_write(outfile)

    while True:
        seq = get_next_seq_from_file(reader, 'fasta')
        if not seq:
            break

        wrapped = seq.multi_line_str(line_length)
        writer.write(str(wrapped) + '\n')

    reader.close()
    writer.close()
def fasta_singleline_ok(filename):
    """Check that a FASTA file has exactly one sequence line per header.

    The file passes when every header (line starting with ``>``) sits on an
    odd line number, i.e. is preceded only by complete header/sequence pairs,
    and the last header is followed by at most one further line.  An empty
    file passes.

    Args:
        filename: Path of the file to check.

    Returns:
        ``True`` if the layout is header/sequence/header/sequence/...,
        ``False`` otherwise.
    """
    line_count = 0
    seq_count = 0
    f = mh12_utils.open_file_read(filename)

    try:
        for line in f:
            line_count += 1
            if line.startswith('>'):
                seq_count += 1
                # The k-th header must be on line 2 * (k - 1) + 1.
                if 2 * (seq_count - 1) != line_count - 1:
                    return False
    finally:
        # Also runs on the early return above, so the handle never leaks.
        f.close()

    # The per-header checks cannot see lines after the last header, so verify
    # separately that the final record does not span more than one line.
    return line_count <= 2 * seq_count
def fastn2uniq(infile, outfile):
    """Copy a FASTA/FASTQ file, keeping only the first sequence for each id.

    Args:
        infile: Path of the input FASTA/FASTQ file.
        outfile: Path of the file to write the de-duplicated sequences to.
    """
    filetype = fasta_or_fastq(infile)
    seen_ids = set()
    reader = mh12_utils.open_file_read(infile)
    writer = mh12_utils.open_file_write(outfile)

    while True:
        seq = get_next_seq_from_file(reader, filetype)
        if not seq:
            break

        if seq.id in seen_ids:
            continue

        seen_ids.add(seq.id)
        writer.write(str(seq) + '\n')

    reader.close()
    writer.close()
def fastn_split(fname, outprefix, no_of_bases):
    """Split a FASTA/FASTQ file into numbered chunk files by total base count.

    Sequences are gathered in input order.  When adding the next sequence
    would bring the running total to ``no_of_bases`` or more, the gathered
    sequences are written out and the next sequence starts a new chunk.  Chunk
    files are named ``outprefix`` + counter + ``'.'`` + file type, with the
    counter starting at 1.

    Args:
        fname: Path of the input FASTA/FASTQ file.
        outprefix: String prepended to every chunk file name.
        no_of_bases: Base-count threshold at which a new chunk is started.

    Returns:
        The number of the last chunk file written.

    Exits via ``sys.exit`` if the format of ``fname`` is not recognised.
    """
    def _dump(path, records):
        # Write every gathered sequence to `path`, one after another.
        handle = mh12_utils.open_file_write(path)
        for record in records:
            handle.write(str(record) + '\n')
        handle.close()

    filetype = fasta_or_fastq(fname)
    if filetype == 'unknown':
        sys.exit('Unknown file format of ' + fname + ' in function fastn.fastn_split')

    f_in = mh12_utils.open_file_read(fname)
    file_counter = 1

    # Seed the first chunk with the first sequence of the file.
    first_seq = get_next_seq_from_file(f_in, filetype)
    pending = [first_seq]
    pending_bases = len(first_seq)

    while True:
        next_seq = get_next_seq_from_file(f_in, filetype)

        if not next_seq:
            # End of input: flush whatever is still gathered.
            _dump(outprefix + str(file_counter) + '.' + filetype, pending)
            break

        if pending_bases + len(next_seq) < no_of_bases:
            # Still room in the current chunk.
            pending.append(next_seq)
            pending_bases += next_seq.length()
            continue

        # Current chunk is full: write it and start the next one.
        _dump(outprefix + str(file_counter) + '.' + filetype, pending)
        pending = [next_seq]
        pending_bases = next_seq.length()
        file_counter += 1

    f_in.close()
    return file_counter
if len(parser.rargs) != 3: parser.print_help() sys.exit(1) options.infile = parser.rargs[0] options.txtout = parser.rargs[1] options.plotout = parser.rargs[2] filetype = fastn.fasta_or_fastq(options.infile) if filetype == 'unknown': sys.exit('File ' + infile + ' not recognised as a fasta/q') gc_hist = dict(zip(range(101), [0] * 101)) f_in = mh12_utils.open_file_read(options.infile) f_out = mh12_utils.open_file_write(options.txtout) while 1: seq = fastn.get_next_seq_from_file(f_in, filetype) if not seq: break if options.window: i = 0 while i < len(seq): tmp = fastn.Fasta(seq.id, seq.seq[i:i + options.window]) gc = tmp.gc() gc_hist[floor(gc)] += 1