def subtract_mouse_reads(
        summary_file, in_fastq, out_fastq, sub_fastq, num_mismatches):
    """Split a FASTQ file into mouse-like reads and all other reads.

    A read is treated as mouse if its row in summary_file has an NM
    value less than or equal to num_mismatches.  Rows with an empty NM
    value (missing alignments) are ignored.

    summary_file    Table read with filelib.read_row(header=1); rows must
                    provide the NM and query_name columns.
    in_fastq        FASTQ file whose reads are partitioned.
    out_fastq       Receives the reads that are NOT flagged as mouse.
    sub_fastq       Receives the reads that ARE flagged as mouse (the
                    reads subtracted from the input).
    num_mismatches  Maximum NM value for a read to count as mouse.

    Both output files are created (or truncated) even if no read is
    written to them.
    """
    from genomicode import filelib
    from genomicode import genomelib

    # Collect the names of the reads that look like mouse.
    mouse_reads = set()
    for d in filelib.read_row(summary_file, header=1):
        if not d.NM:  # ignore missing alignments
            continue
        if int(d.NM) <= num_mismatches:
            mouse_reads.add(d.query_name)

    # Use context managers so both outputs are flushed and closed even if
    # reading or writing fails part-way through.
    with open(out_fastq, 'w') as outhandle:
        with open(sub_fastq, 'w') as subhandle:
            for title, sequence, quality in genomelib.read_fastq(in_fastq):
                # Reduce the FASTQ title to the bare read name: drop the
                # leading "@" and keep only the first whitespace-delimited
                # token, because the BAM file only contains the first part.
                name = title
                if name.startswith("@"):
                    name = name[1:]
                name = name.split()[0]

                if name in mouse_reads:
                    handle = subhandle
                else:
                    handle = outhandle
                genomelib.write_fastq(title, sequence, quality, handle)
def copy_fastq_file(in_filename, out_filename, num_samples):
    """Copy the first num_samples reads of a FASTQ file to a new file.

    in_filename   FASTQ file to read, via genomelib.read_fastq.
    out_filename  File to write; created or truncated.
    num_samples   Maximum number of reads to copy.  Copying stops early
                  if the input has fewer reads.
    """
    from genomicode import genomelib

    # The context manager guarantees the output is flushed and closed,
    # including when an error interrupts the copy.
    with open(out_filename, 'w') as outhandle:
        for i, x in enumerate(genomelib.read_fastq(in_filename)):
            if i >= num_samples:
                break
            genomelib.write_fastq(*x, handle=outhandle)
def copy_fastq(in_filename, out_filename, MAX_READS=None):
    """Copy reads from one FASTQ file to another.

    in_filename   FASTQ file to read, via genomelib.read_fastq.
    out_filename  File to write; created or truncated.
    MAX_READS     Maximum number of reads to copy, or None (the default)
                  to copy every read.

    Copying stops at the end of the input or after MAX_READS reads,
    whichever comes first.  An input with no reads produces an empty
    output file.
    """
    from genomicode import genomelib

    in_iter = iter(genomelib.read_fastq(in_filename))

    # The context manager guarantees the output is flushed and closed,
    # including when an error interrupts the copy.
    with open(out_filename, 'w') as out_handle:
        num_copied = 0
        while MAX_READS is None or num_copied < MAX_READS:
            # The built-in next() works on both Python 2 and 3, and the
            # default turns an exhausted iterator into a falsy value
            # instead of letting StopIteration escape to the caller.
            record = next(in_iter, None)
            if not record:  # no more reads
                break
            genomelib.write_fastq(*record, handle=out_handle)
            num_copied += 1