def __init__(self, reads=False, path_to_baga=False):
    '''
    Initialise with a baga.CollectData.Reads object or the path to a
    previously saved baga object of this class
    '''
    # exclusive or: exactly one of the two arguments must be supplied
    assert bool(reads) ^ bool(path_to_baga), 'Instantiate with baga.CollectData.Reads or '\
            'the path to a previously saved baga.PrepareReads.Reads object'
    if reads:
        try:
            self.read_files = reads.read_files
        except AttributeError:
            print('baga.PrepareReads.Reads needs a baga.CollectData.Reads object '\
                    'with a "read_files" attribute. This can be obtained with the '\
                    '"getFromENA()" or "getFromPath()" methods.')
    else:
        try:
            loaded_baga = _cPickle.load(_gzip.open(path_to_baga, 'rb'))
            for attribute_name in dir(loaded_baga):
                if attribute_name[0] != '_':
                    setattr(self, attribute_name, getattr(loaded_baga, attribute_name))
        except IOError:
            # report the path that was requested, not an undefined name
            print('Could not access {}'.format(path_to_baga))
def saveLocal(self, name):
    '''
    Save downloaded read info to a local compressed pickle file.
    'name' can exclude extension: .baga will be added
    '''
    # name the file after this class so __init__(path_to_baga = ...) can reload it
    fileout = 'baga.PrepareReads.Reads-%s.baga' % name
    print('Saving to %s' % fileout)
    _cPickle.dump(self, _gzip.open(fileout, 'wb'))
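# A minimal usage sketch of the two instantiation modes and the save/reload
# round trip (assumptions: this class is baga.PrepareReads.Reads, and
# "collected" is a populated baga.CollectData.Reads object; the variable and
# file names here are illustrative only):
#
#     reads = Reads(reads = collected)
#     reads.saveLocal('mystudy')    # writes baga.PrepareReads.Reads-mystudy.baga
#     reads = Reads(path_to_baga = 'baga.PrepareReads.Reads-mystudy.baga')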
def subsample(self, genome_size = 6601757, read_cov_depth = 80,
        pc_loss = 0.2, force = False, cov_closeness = 5):
    '''
    Given the size in basepairs of a genome sequence, downsample fastq files
    to a desired average read coverage depth predicted after read alignment.
    Read lengths are taken from the file. By default, 20% of reads are assumed
    to be lost at downstream quality control stages (e.g., quality score based
    trimming); this percent loss is included in the coverage depth estimation.
    cov_closeness, which defaults to 5, prevents subsampling when the
    estimated coverage is already within 5x of the target: this avoids time
    consuming subsampling that would make little difference. A worked example
    of the coverage arithmetic appears after this method.
    '''
    subsampled_read_files = {}
    start_time = _time.time()
    for cnum, (pairname, files) in enumerate(self.read_files.items()):
        processed_path_1 = insert_suffix(files[1], '_subsmp')
        processed_path_2 = insert_suffix(files[2], '_subsmp')
        if not all([_os.path.exists(processed_path_1),
                    _os.path.exists(processed_path_2)]) \
                or force:
            if files[1][-2:] == 'gz':
                fh1 = _gzip.open(files[1])
            else:
                fh1 = open(files[1])
            # use the first read to establish the read length
            aread = _SeqIO.parse(fh1, 'fastq').next()
            read_len = len(aread.seq)
            print('Counting reads in %s' % files[1])
            fh1.seek(0)
            lines = 0
            # report per half million reads (2,000,000 fastq lines)
            interval = 2000000
            nextreport = interval
            for line in fh1:
                lines += 1
                if lines == nextreport:
                    print('{:,} reads'.format(lines / 4))
                    nextreport += interval
            totalreads = lines / 4.0
            print('Found %s reads' % totalreads)
            full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
            print('These paired read files would provide approximately '\
                    '{:.1f}x coverage depth'.format(full_depth_coverage))
            numreads2keep = int(round(genome_size * read_cov_depth / \
                    (read_len * 2) / (1 - pc_loss), 0))
            if numreads2keep >= totalreads:
                print('This pair of read files is estimated to provide only '\
                        '{:.1f}x coverage, but {}x was requested.'.format(
                        full_depth_coverage, read_cov_depth))
                print('No sampling performed. Original files will be used.')
                # pass the original files through in place of subsampled ones
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            elif full_depth_coverage < read_cov_depth + cov_closeness:
                print('This pair of read files is estimated to provide '\
                        '{:.1f}x coverage, which is within {}x of the {}x '\
                        'requested.'.format(full_depth_coverage,
                        cov_closeness, read_cov_depth))
                print('No sampling performed. Original files will be used.')
                # pass the original files through in place of subsampled ones
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            else:
                print('For approximately {}x read coverage, will retain {} '\
                        'of {} {}bp read pairs'.format(read_cov_depth,
                        numreads2keep, totalreads, read_len))
                fh1.seek(0)
                if files[2][-2:] == 'gz':
                    fh2 = _gzip.open(files[2])
                else:
                    fh2 = open(files[2])
                fout1 = _gzip.open(processed_path_1, 'wb')
                fout2 = _gzip.open(processed_path_2, 'wb')
                batch_size = 200000
                # reads to keep per batch to hit the overall proportion
                keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
                nextwrite = batch_size
                written = 0
                n1 = 0
                n2 = 0
                these_lines1 = []
                these_lines2 = []
                reportfreq = 10
                thisreport = 0
                print('Subsampling . . .')
                for line in fh1:
                    these_lines1 += [line]
                    if len(these_lines1) % 4 == 0:
                        n1 += 1
                        if n1 == nextwrite:
                            # draw one index set per batch and reuse it for
                            # the second file so that mates stay paired
                            keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]
                            # try parsing a read as a QC check
                            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report at the first write and at intervals
                                print('Written {:,} reads ({:.1%}) to {}'.format(
                                        written, written / float(numreads2keep),
                                        processed_path_1))
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1
                                    if n2 == nextwrite:
                                        keep_these = []
                                        for i in keep_indices:
                                            i1 = i * 4
                                            i2 = i * 4 + 4
                                            keep_these += these_lines2[i1:i2]
                                        assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                        fout2.write(''.join(keep_these))
                                        these_lines2 = []
                                        if thisreport == reportfreq or written == keep_per_pop:
                                            thisreport = 0
                                            print('Written {:,} reads ({:.1%}) to {}'.format(
                                                    written, written / float(numreads2keep),
                                                    processed_path_2))
                                        nextwrite += batch_size
                                        break
                # write the remainder: count the reads accumulated since the
                # last batch write (the shortfall, nextwrite - n1, would
                # overshoot the buffered list and silently yield empty slices)
                remainder = n1 - (nextwrite - batch_size)
                if remainder:
                    keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
                    keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines1[i1:i2]
                    # try parsing a read as a QC check
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout1.write(''.join(keep_these))
                    written += keep_in_remainder
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                            written / float(numreads2keep), processed_path_1))
                    # collect the remainder of the second file
                    for line2 in fh2:
                        these_lines2 += [line2]
                    # write the remainder of the second file using the same indices
                    keep_these = []
                    for i in keep_indices:
                        i1 = i * 4
                        i2 = i * 4 + 4
                        keep_these += these_lines2[i1:i2]
                    assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                    fout2.write(''.join(keep_these))
                    print('Written {:,} reads ({:.1%}) to {}'.format(written,
                            written / float(numreads2keep), processed_path_2))
                # an alternative implementation that joined each read's four
                # lines up-front was drafted here; unclear whether it would be
                # quicker or slower (more calls to .join())
                fout1.close()
                fout2.close()
                fh1.close()
                fh2.close()
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print('use "force = True" to overwrite')
        if len(self.read_files) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(self.read_files))
        subsampled_read_files[pairname] = {}
        subsampled_read_files[pairname][1] = processed_path_1
        subsampled_read_files[pairname][2] = processed_path_2
    # this step is optional, so keep the full-sized file mapping accessible
    # before replacing the working set with the subsampled files
    self.fullsized_read_files = dict(self.read_files)
    self.read_files = subsampled_read_files
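# A worked example of the sizing arithmetic in subsample(), using the default
# parameters and illustrative read counts (not from any particular dataset):
# with genome_size = 6601757 bp, read_len = 100 bp, totalreads = 5,000,000
# pairs and pc_loss = 0.2:
#
#     full_depth_coverage = 100 * 2 * 5000000 * (1 - 0.2) / 6601757  ~ 121.2x
#     numreads2keep = round(6601757 * 80 / (100 * 2) / (1 - 0.2))    ~ 3300879 pairs
#
# so roughly two thirds of the pairs would be retained for the 80x target.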
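# The mate-pairing guarantee in subsample() comes from drawing a single set of
# random indices per batch and applying it to both files. Below is a minimal
# standalone sketch of that technique, assuming _sample is random.sample as
# imported in this module and records are complete 4-line fastq strings; the
# function name is illustrative and not part of baga:

def _sample_paired_batch(records1, records2, n_keep):
    '''Keep the same n_keep records from two mate-paired fastq batches,
    so that read pairs stay synchronised between the two output files.'''
    keep_indices = sorted(_sample(xrange(len(records1)), n_keep))
    return ([records1[i] for i in keep_indices],
            [records2[i] for i in keep_indices])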