def getFromENA(self, run_acc_list,
        ftp_server_url = 'ftp.sra.ebi.ac.uk',
        local_reads_path = ['reads']):
    '''
    Given a list of 'run' accession numbers for paired-end short read analyses,
    download the read files from the European Nucleotide Archive.

    If using a mirror server, supply an alternative for 'ftp_server_url'.

    'local_reads_path' can be a path string or a list of folder names.
    '''
    if isinstance(local_reads_path, list):
        local_reads_path = _os.path.sep.join(local_reads_path)

    if not _os.path.exists(local_reads_path):
        _os.makedirs(local_reads_path)

    print('Logging in to %s' % ftp_server_url)
    ftp = _FTP(ftp_server_url)
    # anonymous login
    print(ftp.login())

    def check_connection(ftp):
        try:
            # NOOP confirms the connection is still open
            # http://docs.python.org/2/library/ftplib.html
            print('FTP: %s' % ftp.voidcmd("NOOP"))
            return(True)
        except IOError as e:
            print('Seems to be a problem with the connection to FTP server:')
            print('I/O error({0}): {1}'.format(e.errno, e.strerror))
            return(False)

    def calc_checksum(filepath):
        # MD5 of the file contents, read in 64 kB chunks
        hasher = _md5()
        handle = open(filepath, 'rb')
        buff = handle.read(65536)
        while len(buff) > 0:
            hasher.update(buff)
            buff = handle.read(65536)
        return(hasher.hexdigest())

    downloaded_read_files = {}
    start_time = _time.time()
    failed = []
    for cnum, run_acc in enumerate(run_acc_list):
        query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
        success = False
        tries = 0
        max_tries = 5
        while not success:
            rest_req = '"run_accession=%s"&result=read_run&fields=fastq_ftp,fastq_md5&display=report' % run_acc
            print('Sending query to ENA:\n%s' % rest_req)
            result = _urllib2.urlopen(query_url_base + rest_req).read()
            print('ENA accession numbers query result:\n%s' % result)
            # crude sanity check on the content of the returned report
            if result.count('ERR') == 7:
                success = True
            else:
                print('Query result from ENA was unexpected on attempt %s of %s' % (tries, max_tries))
                _time.sleep(0.5)
                tries += 1
                if tries == max_tries:
                    print('Attempt %s failed. Try again later and if problem persists, report bug.' % tries)
                    failed += [run_acc]
                    break
                    #_sys.exit(1)

        if not success:
            continue

        # last data line of the report: tab-separated fields, ';'-separated per read pair
        md5s = result.split('\n')[-2].split('\t')[-1][:-1].split(';')
        ENA_paths = result.split('\n')[-2].split('\t')[-2][:-1].split(';')

        ENA_reads_pair_paths = {}
        ENA_reads_pair_paths[1] = ENA_paths[0].replace(ftp_server_url, '')
        ENA_reads_pair_paths[2] = ENA_paths[1].replace(ftp_server_url, '')

        local_reads_pair_paths = {}
        local_reads_pair_paths[1] = local_reads_path + \
                                    _os.path.sep + \
                                    ENA_reads_pair_paths[1].split('/')[-1]
        local_reads_pair_paths[2] = local_reads_path + \
                                    _os.path.sep + \
                                    ENA_reads_pair_paths[2].split('/')[-1]

        downloaded_read_files[run_acc] = {}

        for f in (1, 2):
            # ensure connection is still open
            while not check_connection(ftp):
                _sleep(0.5)
                print('Attempting to re-establish connection . . .')
                ftp = _FTP(ftp_server_url)
                # anonymous login
                print(ftp.login())

            expected_checksum = md5s[f - 1]
            exists = _os.path.exists(local_reads_pair_paths[f])
            if exists:
                print('File %s for %s exists locally: %s' % (f, run_acc, local_reads_pair_paths[f]))
                actual_checksum = calc_checksum(local_reads_pair_paths[f])
                if actual_checksum == expected_checksum:
                    print('File checksum matches: %s. Skipping download' % (expected_checksum))
                    downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                    continue
                else:
                    print('Checksum mismatch')

            print('Downloading via %s: %s' % (ftp_server_url, ENA_reads_pair_paths[f]))
            res = ftp.retrbinary('RETR %s' % ENA_reads_pair_paths[f],
                                 open(local_reads_pair_paths[f], 'wb').write)
            print('FTP: %s' % res)

            print('Calculating checksum . . .')
            actual_checksum = calc_checksum(local_reads_pair_paths[f])
            if actual_checksum == expected_checksum:
                print('File checksum matches: %s.' % (expected_checksum))
                downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
            else:
                print('Checksum mismatch for: %s' % local_reads_pair_paths[f])

        if len(run_acc_list) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(run_acc_list))

    if len(failed) > 0:
        print('WARNING: some accession numbers did not return a result from ENA')
        print('Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
        print(', '.join(failed))

    self.read_files = downloaded_read_files
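
# The block below is an illustrative sketch, not part of the original module:
# it shows, standalone, how the tab-separated report returned by the ENA
# warehouse query in getFromENA() above can be split into FTP paths and MD5
# checksums. The helper name and the assumed report layout (a header line then
# one data line, with fastq_ftp and fastq_md5 as the last two tab-separated
# columns, ';'-separated within each) are assumptions for demonstration only.
def _example_parse_ena_report(result):
    '''Return ([ftp_path_1, ftp_path_2], [md5_1, md5_2]) from an ENA report string.'''
    # the last non-empty line of the report holds the data for the queried run
    data_line = [l for l in result.split('\n') if len(l)][-1]
    fields = data_line.split('\t')
    ftp_paths = fields[-2].rstrip(';').split(';')   # forward and reverse fastq paths
    md5s = fields[-1].rstrip(';').split(';')        # checksums in the same order
    return ftp_paths, md5s
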
def SPAdes(self, exe = [], output_folder = ['assemblies', 'SPAdes'],
        mem_num_gigs = 8, max_cpus = -1, single_assembly = False,
        careful = True, only_assembler = False):
    '''
    de novo assembly of short reads using SPAdes

    By default, each set of paired read fastq files in the dictionary
    self.read_files is assembled separately; if single_assembly is set to
    True, all sets of paired read fastq files are combined into a single
    assembly.

    http://spades.bioinf.spbau.ru/release3.6.1/manual.html

    relevant inputs:
      -o <output_dir>        Specify the output directory. Required option.
      --sc                   required for MDA (single-cell) data.
      --only-error-correction
      --only-assembler
      --careful              reduce the number of mismatches and short indels;
                             runs MismatchCorrector, a post-processing tool. Recommended.
      --continue             continue from the specified output folder starting
                             from the last available check-point
      --restart-from <check_point>
                             ec      restart from error correction
                             as      restart assembly module from the first iteration
                             k<int>  restart from the iteration with the specified k value, e.g. k55
                             mc      restart mismatch correction
      --pe1-12 <file_name>   interlaced forward and reverse paired-end reads.
      --pe1-1 <file_name>    File with forward reads.
      --pe1-2 <file_name>    File with reverse reads.
      --pe1-s <file_name>    File with unpaired reads . . use --pe2-... for next library
      --threads <int>
      --memory <int>         max memory in Gb
      -k <int,int,...>       Comma-separated list of odd ascending k-mers. If --sc
                             is set the default values are 21,33,55; for multicell
                             data sets it is auto.
      --cov-cutoff <float>   positive float value, or 'auto', or 'off'. Default value is 'off'
    '''
    assert isinstance(output_folder, list), 'Provide output folder as list of folders forming path'

    base_output_path = _os.path.sep.join(output_folder)
    if not _os.path.exists(base_output_path):
        _os.makedirs(base_output_path)

    # max threads is slightly different to cpus
    # . . can probably use more
    max_processes = _decide_max_processes(max_cpus)

    # if an exe is not provided, use that stored in Dependencies
    if len(exe):
        use_exe = _os.path.sep.join(exe)
    else:
        from baga import Dependencies
        use_exe = _get_exe_path('spades')

    def run_SPAdes(cmd):
        proc = _subprocess.Popen(cmd, stdout=_subprocess.PIPE, stderr=_subprocess.PIPE)
        # allow for failed SPAdes runs (possibly caused by small fastq files)
        # <== but also check they were actually built properly
        try:
            stdout_value, stderr_value = proc.communicate()
            # collect any warnings listed at the end of the SPAdes log
            checkthese = []
            getline = False
            for line in stdout_value.split('\n'):
                if 'Warnings saved to' in line:
                    getline = False
                if getline:
                    l = line.rstrip()
                    if len(l):
                        checkthese += [l]
                if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                    getline = True

            if len(checkthese):
                print('SPAdes completed with warnings:\n{}\n'.format('\n'.join(checkthese)))
            else:
                print('SPAdes completed without warnings')

            # with open('___SPAdes_{}_good_{}.log'.format(cnum, thetime), 'w') as fout:
            #     fout.write(stdout_value)
            path2contigs = _os.path.sep.join([this_output_path, 'contigs.fasta'])
        except _subprocess.CalledProcessError as e:
            print('SPAdes probably did not complete: error returned ({})'.format(proc.returncode))
            print('Error: {}'.format(e))
            print('Writing some info relevant to SPAdes crash to ___SPAdes_{}_bad_{}.log'.format(cnum, thetime))
            with open('___SPAdes_{}_bad_{}.log'.format(cnum, thetime), 'w') as fout:
                fout.write(str(dir(proc)))
                fout.write('\n' + str(e.returncode) + '\n')
                fout.write(_os.path.sep.join([this_output_path, 'contigs.fasta']))
            path2contigs = None

        return(path2contigs)

    if isinstance(use_exe, list):
        # allow for use of prepended executable with script to run
        cmd = list(use_exe)
    else:
        # or just executable
        cmd = [use_exe]

    contigs = {}
    if single_assembly:
        print('Combining reads aligned at multiple regions into single assembly')
        if isinstance(use_exe, list):
            # allow for use of prepended executable with script to run
            cmd = list(use_exe)
        else:
            # or just executable
            cmd = [use_exe]
        for cnum, (pairname, files) in enumerate(self.read_files.items()):
            # allow use of tuples or dicts by converting dicts to lists
            if isinstance(files, dict):
                use_files = []
                for k, v in sorted(files.items()):
                    use_files += [v]
            else:
                use_files = files

            cmd += ['--pe{}-1'.format(cnum + 1), use_files[0]]
            cmd += ['--pe{}-2'.format(cnum + 1), use_files[1]]
            try:
                # use unpaired reads if available
                cmd += ['--pe{}-s'.format(cnum + 1), use_files[2]]
            except IndexError:
                pass

            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k, v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]

                cmd += ['--pe{}-1'.format(cnum + 2), use_files2[0]]
                cmd += ['--pe{}-2'.format(cnum + 2), use_files2[1]]
                try:
                    cmd += ['--pe{}-s'.format(cnum + 2), use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass

        ## this isn't very flexible:
        # retain <sample>__<genome> from pairname:
        # pairname == <sample>__<genome>_<start>-<end>+<padding>
        # and replace with multiregion
        folder = '{}__{}_{}'.format(pairname.split('__')[0],
                                    pairname.split('__')[1].split('_')[0],
                                    'multi_region')
        this_output_path = _os.path.sep.join(output_folder + [folder])
        if not _os.path.exists(this_output_path):
            _os.makedirs(this_output_path)

        cmd += ['-o', this_output_path]
        cmd += ['--threads', str(max_processes)]
        cmd += ['--memory', str(mem_num_gigs)]
        if only_assembler:
            cmd += ['--only-assembler']
        if careful:
            cmd += ['--careful']

        thetime = _time.asctime(_time.localtime(_time.time()))
        print('about to launch SPAdes . . . at {}'.format(thetime))
        print(' '.join(cmd))
        contigs['multi_region'] = run_SPAdes(cmd)
    else:
        start_time = _time.time()
        # prepare command line and launch each SPAdes assembly
        contigs = {}
        for cnum, (pairname, files) in enumerate(sorted(self.read_files.items())):
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                cmd = list(use_exe)
            else:
                # or just executable
                cmd = [use_exe]
            # allow use of tuples or dicts by converting dicts to lists
            if isinstance(files, dict):
                use_files = []
                for k, v in sorted(files.items()):
                    use_files += [v]
            else:
                use_files = files

            cmd += ['--pe1-1', use_files[0]]
            cmd += ['--pe1-2', use_files[1]]
            try:
                # use unpaired reads if available
                cmd += ['--pe1-s', use_files[2]]
            except IndexError:
                pass

            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k, v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]

                cmd += ['--pe2-1', use_files2[0]]
                cmd += ['--pe2-2', use_files2[1]]
                try:
                    cmd += ['--pe2-s', use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass

            this_output_path = _os.path.sep.join(output_folder + [pairname])
            if not _os.path.exists(this_output_path):
                _os.makedirs(this_output_path)

            cmd += ['-o', this_output_path]
            cmd += ['--threads', str(max_processes)]
            cmd += ['--memory', str(mem_num_gigs)]
            if only_assembler:
                cmd += ['--only-assembler']
            if careful:
                cmd += ['--careful']

            thetime = _time.asctime(_time.localtime(_time.time()))
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            contigs[pairname] = run_SPAdes(cmd)
            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))

    self.paths_to_contigs = contigs
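
# The block below is an illustrative sketch, not part of the original module:
# it shows the shape of the SPAdes command line that SPAdes() above assembles
# for a single paired-end library. The file paths, output directory and
# resource numbers are placeholders; the flags are the SPAdes options already
# listed in the method's docstring.
def _example_spades_command(spades_exe='spades.py',
                            fwd='reads/sample_1.fastq.gz',
                            rev='reads/sample_2.fastq.gz',
                            out_dir='assemblies/SPAdes/sample',
                            threads=4, mem_num_gigs=8):
    cmd = [spades_exe,
           '--pe1-1', fwd,                  # forward reads of library 1
           '--pe1-2', rev,                  # reverse reads of library 1
           '-o', out_dir,                   # per-sample output directory
           '--threads', str(threads),
           '--memory', str(mem_num_gigs),   # max memory in Gb
           '--careful']                     # run MismatchCorrector post-processing
    return cmd
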
def subsample(self, genome_size = 6601757, read_cov_depth = 80, pc_loss = 0.2,
        force = False, cov_closeness = 5):
    '''
    Given the size in basepairs of a genome sequence, downsample fastq files to
    a desired average read coverage depth predicted after read alignment. Read
    lengths are taken from the file. By default, 20% of reads are assumed to be
    lost at downstream quality control stages (e.g. quality score based
    trimming). The percent loss is used in the coverage depth estimation.

    cov_closeness, which defaults to 5, prevents subsampling if the predicted
    coverage is within 5x of that requested: this avoids time-consuming
    subsampling that would only make a small difference.
    '''
    subsampled_read_files = {}
    start_time = _time.time()
    for cnum, (pairname, files) in enumerate(self.read_files.items()):
        processed_path_1 = insert_suffix(files[1], '_subsmp')
        processed_path_2 = insert_suffix(files[2], '_subsmp')

        if not all([_os.path.exists(processed_path_1),
                    _os.path.exists(processed_path_2)]) \
                or force:

            if files[1][-2:] == 'gz':
                fh1 = _gzip.open(files[1])
            else:
                fh1 = open(files[1])

            # take the read length from the first read in the file
            aread = _SeqIO.parse(fh1, 'fastq').next()
            read_len = len(aread.seq)

            print('Counting reads in %s' % files[1])
            fh1.seek(0)
            lines = 0
            # report per half million reads
            interval = 2000000
            nextreport = interval
            for line in fh1:
                lines += 1
                if lines == nextreport:
                    print('{:,} reads'.format(lines / 4))
                    nextreport += interval

            totalreads = lines / 4.0
            print('Found %s reads' % totalreads)

            full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
            print('These paired read files would provide approximately {:.1f}x coverage depth'.format(full_depth_coverage))

            numreads2keep = int(round(genome_size * read_cov_depth / (read_len * 2) / (1 - pc_loss), 0))

            if numreads2keep >= totalreads:
                print('This pair of read files is estimated to provide only {:.1f}x coverage, but {}x requested.'.format(full_depth_coverage, read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            elif full_depth_coverage < read_cov_depth + cov_closeness:
                print('This pair of read files is estimated to provide {:.1f}x coverage which is within {}x of {}x requested.'.format(full_depth_coverage, cov_closeness, read_cov_depth))
                print('No sampling performed. Original files will be used')
                # pass original files over with subsampled
                subsampled_read_files[pairname] = {}
                subsampled_read_files[pairname][1] = files[1]
                subsampled_read_files[pairname][2] = files[2]
                fh1.close()
                if len(self.read_files) > 1:
                    # report durations, time left etc
                    _report_time(start_time, cnum, len(self.read_files))
                continue
            else:
                print('For approximately {}x read coverage, will retain {} of {} {}bp read pairs'.format(
                        read_cov_depth, numreads2keep, totalreads, read_len))

                fh1.seek(0)
                if files[2][-2:] == 'gz':
                    fh2 = _gzip.open(files[2])
                else:
                    fh2 = open(files[2])

                fout1 = _gzip.open(processed_path_1, 'wb')
                fout2 = _gzip.open(processed_path_2, 'wb')

                batch_size = 200000
                keep_per_pop = int(numreads2keep / float(totalreads) * batch_size) + 1
                nextwrite = batch_size
                written = 0
                n1 = 0
                n2 = 0
                these_lines1 = []
                these_lines2 = []
                reportfreq = 10
                thisreport = 0
                print('Subsampling . . .')
                for line in fh1:
                    these_lines1 += [line]
                    if len(these_lines1) % 4 == 0:
                        n1 += 1
                        if n1 == nextwrite:
                            keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                            keep_these = []
                            for i in keep_indices:
                                i1 = i * 4
                                i2 = i * 4 + 4
                                keep_these += these_lines1[i1:i2]
                            # try parsing a read for QC
                            assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                            fout1.write(''.join(keep_these))
                            these_lines1 = []
                            written += keep_per_pop
                            thisreport += 1
                            if thisreport == reportfreq or written == keep_per_pop:
                                # report first time and at intervals
                                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                        written / float(numreads2keep), processed_path_1))
                            for line2 in fh2:
                                these_lines2 += [line2]
                                if len(these_lines2) % 4 == 0:
                                    n2 += 1
                                    if n2 == nextwrite:
                                        keep_these = []
                                        for i in keep_indices:
                                            i1 = i * 4
                                            i2 = i * 4 + 4
                                            keep_these += these_lines2[i1:i2]
                                        assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                                        fout2.write(''.join(keep_these))
                                        these_lines2 = []
                                        if thisreport == reportfreq or written == keep_per_pop:
                                            thisreport = 0
                                            print('Written {:,} reads ({:.1%}) to {}'.format(written,
                                                    written / float(numreads2keep), processed_path_2))
                                        nextwrite += batch_size
                                        break

                # write remainder
                remainder = nextwrite - n1
                keep_in_remainder = int(keep_per_pop * (remainder / float(batch_size))) + 1
                keep_indices = sorted(_sample(xrange(remainder), keep_in_remainder))
                keep_these = []
                for i in keep_indices:
                    i1 = i * 4
                    i2 = i * 4 + 4
                    keep_these += these_lines1[i1:i2]
                # try parsing a read for QC
                assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                fout1.write(''.join(keep_these))
                written += keep_in_remainder
                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                        written / float(numreads2keep), processed_path_1))

                # get remainder
                for line2 in fh2:
                    these_lines2 += [line2]

                # write remainder
                keep_these = []
                for i in keep_indices:
                    i1 = i * 4
                    i2 = i * 4 + 4
                    keep_these += these_lines2[i1:i2]
                ###### check why keep_these was empty
                assert _SeqIO.read(_StringIO(''.join(keep_these[:4])), 'fastq')
                fout2.write(''.join(keep_these))
                print('Written {:,} reads ({:.1%}) to {}'.format(written,
                        written / float(numreads2keep), processed_path_2))

                # not sure if this is quicker/slower (more calls to .join())
                # this_read = []
                # for line in fh1:
                #     this_read += [line]
                #     if len(this_read) == 4:
                #         these_reads1 += [''.join(this_read)]
                #         #these_reads1 += this_read
                #         this_read = []
                #         n1 += 1
                #         if n1 == nextwrite:
                #             keep_indices = sorted(_sample(xrange(batch_size), keep_per_pop))
                #             # try parsing a read for QC
                #             assert _SeqIO.read(_StringIO(these_reads1[0]), 'fastq')
                #             fout1.write(''.join([these_reads1[i] for i in keep_indices]))
                #             these_reads1 = []
                #             written += keep_per_pop
                #             print('Written {:,} reads ({:.2%}) to {}'.format(written,
                #                     written/float(numreads2keep), processed_path_1))
                #             for line2 in fh2:
                #                 this_read += [line2]
                #                 if len(this_read) == 4:
                #                     these_reads2 += [''.join(this_read)]
                #                     this_read = []
                #                     n2 += 1
                #                     if n2 == nextwrite:
                #                         assert _SeqIO.read(_StringIO(these_reads2[0]), 'fastq')
                #                         fout2.write(''.join([these_reads2[i] for i in keep_indices]))
                #                         these_reads2 = []
                #                         print('Written {:,} reads ({:.2%}) to {}'.format(written,
                #                                 written/float(numreads2keep), processed_path_2))
                #                         nextwrite += batch_size
                #                         break

                fout1.close()
                fout2.close()
                fh1.close()
                fh2.close()
        else:
            print('Found:')
            print(processed_path_1)
            print(processed_path_2)
            print('use "force = True" to overwrite')

        if len(self.read_files) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(self.read_files))

        subsampled_read_files[pairname] = {}
        subsampled_read_files[pairname][1] = processed_path_1
        subsampled_read_files[pairname][2] = processed_path_2

    # replace here as this step is optional;
    # keep a copy of the full-sized file paths before replacing
    self.fullsized_read_files = dict(self.read_files)
    self.read_files = subsampled_read_files
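
# The block below is an illustrative sketch, not part of the original module:
# it restates the coverage arithmetic used by subsample() above as a standalone
# function. The default read length and total read-pair count are assumptions
# for demonstration only; the genome size default matches the method's.
def _example_coverage_arithmetic(read_len=100, totalreads=5e6,
                                 genome_size=6601757, pc_loss=0.2,
                                 read_cov_depth=80):
    # depth expected if every read pair aligned, discounted by predicted QC losses
    full_depth_coverage = read_len * 2 * totalreads * (1 - pc_loss) / genome_size
    # read pairs to retain so the surviving reads give approximately the requested depth
    numreads2keep = int(round(
            genome_size * read_cov_depth / (read_len * 2) / (1 - pc_loss), 0))
    return full_depth_coverage, numreads2keep
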
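
# Illustrative usage sketch (an assumption, not part of the original module):
# the methods above appear to belong to a reads-collection class; the class
# name and the ENA run accessions below are placeholders only.
#
#     reads = Reads()
#     reads.getFromENA(['ERRxxxxxx1', 'ERRxxxxxx2'])           # fetch fastq pairs from ENA
#     reads.subsample(genome_size=6601757, read_cov_depth=80)  # optional downsampling
#     reads.SPAdes(mem_num_gigs=8, max_cpus=4)                 # de novo assembly per sample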