def get_git(name, description, source, url, commit, checksum, destination,
            preparation, checker):
    '''Get a dependency from git'''
    if _os.path.realpath(_os.path.curdir) != destination:
        try:
            _os.chdir(destination)
        except OSError:
            _os.makedirs(destination)
            _os.chdir(destination)
    try:
        # clear any previous versions
        _shutil.rmtree(url.split('/')[-1].replace('.git', ''))
    except OSError:
        pass
    git_server = url.replace('https://', '').replace('http://', '').split('/')[0]
    print('Downloading {} via git from {} . . .'.format(name, git_server))
    _subprocess.call(['git', 'clone', url])
    _os.chdir(url.split('/')[-1].replace('.git', ''))
    _subprocess.call(['git', 'checkout', commit])
    # if the repo uses git submodules, set them to the correct revisions
    # for this commit; otherwise this does nothing
    _subprocess.call(['git', 'submodule', 'update', '--init'])
    working_dir = _os.path.sep.join(
            [destination, url.split('/')[-1].replace('.git', '')])
    if preparation is not None:
        for do_this in preparation:
            if isinstance(do_this['arguments'], dict):
                do_this['function'](**do_this['arguments'])
            else:
                do_this['function'](*do_this['arguments'])
            # restore position in path if a prepare changed it
            if working_dir != _os.path.realpath(_os.path.curdir):
                _os.chdir(working_dir)
    _os.chdir(_os.path.pardir)
    _os.chdir(_os.path.pardir)
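
# A minimal usage sketch for get_git, assuming hypothetical values: the real
# name/url/commit combinations come from the dependency definitions used with
# this module, so everything below is illustrative only.
#
# get_git(name='example-tool',
#         description='an example git dependency',
#         source='git',
#         url='https://github.com/example/example-tool.git',
#         commit='abc1234',           # hypothetical commit hash
#         checksum=None,              # unused by get_git
#         destination='external_programs',
#         preparation=None,
#         checker=None)
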
def getFromENA(self, run_acc_list,
               ftp_server_url='ftp.sra.ebi.ac.uk',
               local_reads_path=['reads']):
    '''
    Given a list of 'run' accession numbers for paired-end short read
    analyses, download the read files from the European Nucleotide Archive.

    If using a mirror server, supply an alternative for 'ftp_server_url'.
    'local_reads_path' can be a path string or a list of folder names.
    '''
    if isinstance(local_reads_path, list):
        local_reads_path = _os.path.sep.join(local_reads_path)
    if not _os.path.exists(local_reads_path):
        _os.makedirs(local_reads_path)
    print('Logging in to %s' % ftp_server_url)
    ftp = _FTP(ftp_server_url)
    # anonymous login
    print(ftp.login())
    def check_connection(ftp):
        try:
            # http://docs.python.org/2/library/ftplib.html
            print('FTP: %s' % ftp.voidcmd("NOOP"))
            return(True)
        except IOError as e:
            print('Seems to be a problem with the connection to FTP server:')
            print('I/O error({0}): {1}'.format(e.errno, e.strerror))
            return(False)
    def calc_checksum(filepath):
        hasher = _md5()
        handle = open(filepath, 'rb')
        buff = handle.read(65536)
        while len(buff) > 0:
            hasher.update(buff)
            buff = handle.read(65536)
        return(hasher.hexdigest())
    downloaded_read_files = {}
    start_time = _time.time()
    failed = []
    for cnum, run_acc in enumerate(run_acc_list):
        query_url_base = 'http://www.ebi.ac.uk/ena/data/warehouse/search?query='
        success = False
        tries = 0
        max_tries = 5
        while not success:
            rest_req = ('"run_accession=%s"&result=read_run'
                    '&fields=fastq_ftp,fastq_md5&display=report') % run_acc
            print('Sending query to ENA:\n%s' % rest_req)
            result = _urllib2.urlopen(query_url_base + rest_req).read()
            print('ENA accession numbers query result:\n%s' % result)
            if result.count('ERR') == 7:
                success = True
            else:
                print('Query result from ENA was unexpected on attempt %s of %s' % (
                        tries, max_tries))
                _time.sleep(0.5)
                tries += 1
                if tries == max_tries:
                    print('Attempt %s failed. Try again later and if problem '
                          'persists, report bug.' % tries)
                    failed += [run_acc]
                    break
                    #_sys.exit(1)
        if not success:
            continue
        md5s = result.split('\n')[-2].split('\t')[-1][:-1].split(';')
        ENA_paths = result.split('\n')[-2].split('\t')[-2][:-1].split(';')
        ENA_reads_pair_paths = {}
        ENA_reads_pair_paths[1] = ENA_paths[0].replace(ftp_server_url, '')
        ENA_reads_pair_paths[2] = ENA_paths[1].replace(ftp_server_url, '')
        local_reads_pair_paths = {}
        local_reads_pair_paths[1] = local_reads_path + \
                                    _os.path.sep + \
                                    ENA_reads_pair_paths[1].split('/')[-1]
        local_reads_pair_paths[2] = local_reads_path + \
                                    _os.path.sep + \
                                    ENA_reads_pair_paths[2].split('/')[-1]
        downloaded_read_files[run_acc] = {}
        for f in (1, 2):
            # ensure connection is still open
            while not check_connection(ftp):
                _sleep(0.5)
                print('Attempting to re-establish connection . . .')
                ftp = _FTP(ftp_server_url)
                # anonymous login
                print(ftp.login())
            expected_checksum = md5s[f - 1]
            exists = _os.path.exists(local_reads_pair_paths[f])
            if exists:
                print('File %s for %s exists locally: %s' % (
                        f, run_acc, local_reads_pair_paths[f]))
                actual_checksum = calc_checksum(local_reads_pair_paths[f])
                if actual_checksum == expected_checksum:
                    print('File checksum matches: %s. Skipping download' % (
                            expected_checksum))
                    downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
                    continue
                else:
                    print('Checksum mismatch')
            print('Downloading via %s: %s' % (
                    ftp_server_url, ENA_reads_pair_paths[f]))
            res = ftp.retrbinary('RETR %s' % ENA_reads_pair_paths[f],
                    open(local_reads_pair_paths[f], 'wb').write)
            print('FTP: %s' % res)
            print('Calculating checksum . . .')
            actual_checksum = calc_checksum(local_reads_pair_paths[f])
            if actual_checksum == expected_checksum:
                print('File checksum matches: %s.' % (expected_checksum))
                downloaded_read_files[run_acc][f] = local_reads_pair_paths[f]
            else:
                print('Checksum mismatch for: %s' % local_reads_pair_paths[f])
        if len(run_acc_list) > 1:
            # report durations, time left etc
            _report_time(start_time, cnum, len(run_acc_list))
    if len(failed) > 0:
        print('WARNING: some accession numbers did not return a result from ENA')
        print('Try searching http://www.ebi.ac.uk/ena in a web-browser for:')
        print(', '.join(failed))
    self.read_files = downloaded_read_files
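
# Hedged usage sketch: the accession below is illustrative, and the object
# providing this method is assumed to be this module's reads-collection
# class (instantiation details may differ).
#
# downloader.getFromENA(['ERR123456'],
#                       ftp_server_url='ftp.sra.ebi.ac.uk',
#                       local_reads_path=['reads'])
# downloader.read_files  # {run_acc: {1: local_R1_path, 2: local_R2_path}}
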
def SPAdes(self, exe=[], output_folder=['assemblies', 'SPAdes'],
           mem_num_gigs=8, max_cpus=-1, single_assembly=False,
           careful=True, only_assembler=False):
    '''
    de novo assembly of short reads using SPAdes

    By default, the short reads provided in the dictionary self.read_files
    are assembled separately, unless single_assembly is set to True, in
    which case all sets of paired read fastq files are combined into a
    single assembly.

    http://spades.bioinf.spbau.ru/release3.6.1/manual.html
    relevant inputs:
    -o <output_dir>         Specify the output directory. Required option.
    --sc                    required for MDA (single-cell) data.
    --only-error-correction
    --only-assembler
    --careful               reduce the number of mismatches and short
                            indels; runs MismatchCorrector, a
                            post-processing tool. Recommended.
    --continue              continue from the specified output folder,
                            starting from the last available checkpoint
    --restart-from <check_point>
        ec                  restart from error correction
        as                  restart assembly module from the first iteration
        k<int>              restart from the iteration with the specified
                            k value, e.g. k55
        mc                  restart mismatch correction
    --pe1-12 <file_name>    interlaced forward and reverse paired-end reads.
    --pe1-1 <file_name>     File with forward reads.
    --pe1-2 <file_name>     File with reverse reads.
    --pe1-s <file_name>     File with unpaired reads.
    . . use --pe2-... for the next library
    --threads <int>
    --memory <int>          max memory in Gb
    -k <int,int,...>        Comma-separated list of odd, ascending k-mers.
                            If --sc is set the defaults are 21,33,55; for
                            multicell data sets the default is auto.
    --cov-cutoff <float>    positive float value, or 'auto', or 'off'.
                            Default value is 'off'.
    '''
    assert isinstance(output_folder, list), \
            'Provide output folder as list of folders forming path'
    base_output_path = _os.path.sep.join(output_folder)
    if not _os.path.exists(base_output_path):
        _os.makedirs(base_output_path)
    # max threads is slightly different to cpus
    # . . can probably use more
    max_processes = _decide_max_processes(max_cpus)
    # if an exe is not provided, use that stored in Dependencies
    if len(exe):
        use_exe = _os.path.sep.join(exe)
    else:
        from baga import Dependencies
        use_exe = _get_exe_path('spades')
    def run_SPAdes(cmd):
        proc = _subprocess.Popen(cmd, stdout=_subprocess.PIPE,
                stderr=_subprocess.PIPE)
        # allow for failed SPAdes runs (possibly caused by small fastq
        # files) <== but also check they were actually built properly
        try:
            stdout_value, stderr_value = proc.communicate()
            checkthese = []
            getline = False
            for line in stdout_value.split('\n'):
                if 'Warnings saved to' in line:
                    getline = False
                if getline:
                    l = line.rstrip()
                    if len(l):
                        checkthese += [l]
                if 'SPAdes pipeline finished WITH WARNINGS!' in line:
                    getline = True
            if len(checkthese):
                print('SPAdes completed with warnings:\n{}\n'.format(
                        '\n'.join(checkthese)))
            else:
                print('SPAdes completed without warnings')
            # with open('___SPAdes_{}_good_{}.log'.format(cnum, thetime), 'w') as fout:
            #     fout.write(stdout_value)
            path2contigs = _os.path.sep.join([this_output_path, 'contigs.fasta'])
        except _subprocess.CalledProcessError as e:
            print('SPAdes probably did not complete: error returned ({})'.format(
                    proc.returncode))
            print('Error: {}'.format(e))
            print('Writing some info relevant to SPAdes crash to '
                  '___SPAdes_{}_bad_{}.log'.format(cnum, thetime))
            with open('___SPAdes_{}_bad_{}.log'.format(cnum, thetime), 'w') as fout:
                # dir() returns a list, so join it before writing
                fout.write('\n'.join(dir(proc)))
                fout.write('\n' + str(e.returncode) + '\n')
                fout.write(_os.path.sep.join([this_output_path, 'contigs.fasta']))
            path2contigs = None
        return(path2contigs)
    contigs = {}
    if single_assembly:
        print('Combining reads aligned at multiple regions into single assembly')
        if isinstance(use_exe, list):
            # allow for use of prepended executable with script to run
            cmd = list(use_exe)
        else:
            # or just executable
            cmd = [use_exe]
        for cnum, (pairname, files) in enumerate(self.read_files.items()):
            # allow use of tuples or dicts by converting dicts to lists
            if isinstance(files, dict):
                use_files = []
                for k, v in sorted(files.items()):
                    use_files += [v]
            else:
                use_files = files
            cmd += ['--pe{}-1'.format(cnum + 1), use_files[0]]
            cmd += ['--pe{}-2'.format(cnum + 1), use_files[1]]
            try:
                # use unpaired reads if available
                cmd += ['--pe{}-s'.format(cnum + 1), use_files[2]]
            except IndexError:
                pass
            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k, v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]
                cmd += ['--pe{}-1'.format(cnum + 2), use_files2[0]]
                cmd += ['--pe{}-2'.format(cnum + 2), use_files2[1]]
                try:
                    cmd += ['--pe{}-s'.format(cnum + 2), use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass
        ## this isn't very flexible:
        # retain <sample>__<genome> from pairname:
        # pairname == <sample>__<genome>_<start>-<end>+<padding>
        # and replace with multiregion
        folder = '{}__{}_{}'.format(pairname.split('__')[0],
                pairname.split('__')[1].split('_')[0],
                'multi_region')
        this_output_path = _os.path.sep.join(output_folder + [folder])
        if not _os.path.exists(this_output_path):
            _os.makedirs(this_output_path)
        cmd += ['-o', this_output_path]
        cmd += ['--threads', str(max_processes)]
        cmd += ['--memory', str(mem_num_gigs)]
        if only_assembler:
            cmd += ['--only-assembler']
        if careful:
            cmd += ['--careful']
        thetime = _time.asctime(_time.localtime(_time.time()))
        print('about to launch SPAdes . . . at {}'.format(thetime))
        print(' '.join(cmd))
        contigs['multi_region'] = run_SPAdes(cmd)
    else:
        start_time = _time.time()
        # prepare command line and launch each SPAdes assembly
        contigs = {}
        for cnum, (pairname, files) in enumerate(sorted(self.read_files.items())):
            if isinstance(use_exe, list):
                # allow for use of prepended executable with script to run
                cmd = list(use_exe)
            else:
                # or just executable
                cmd = [use_exe]
            # allow use of tuples or dicts by converting dicts to lists
            if isinstance(files, dict):
                use_files = []
                for k, v in sorted(files.items()):
                    use_files += [v]
            else:
                use_files = files
            cmd += ['--pe1-1', use_files[0]]
            cmd += ['--pe1-2', use_files[1]]
            try:
                # use unpaired reads if available
                cmd += ['--pe1-s', use_files[2]]
            except IndexError:
                pass
            try:
                # add a second library if provided
                if isinstance(self.read_files2[pairname], dict):
                    # if a dict supplied, make it a list
                    use_files2 = []
                    for k, v in sorted(self.read_files2[pairname].items()):
                        use_files2 += [v]
                else:
                    use_files2 = self.read_files2[pairname]
                cmd += ['--pe2-1', use_files2[0]]
                cmd += ['--pe2-2', use_files2[1]]
                try:
                    cmd += ['--pe2-s', use_files2[2]]
                except IndexError:
                    pass
            except AttributeError:
                pass
            this_output_path = _os.path.sep.join(output_folder + [pairname])
            if not _os.path.exists(this_output_path):
                _os.makedirs(this_output_path)
            cmd += ['-o', this_output_path]
            cmd += ['--threads', str(max_processes)]
            cmd += ['--memory', str(mem_num_gigs)]
            if only_assembler:
                cmd += ['--only-assembler']
            if careful:
                cmd += ['--careful']
            thetime = _time.asctime(_time.localtime(_time.time()))
            print('about to launch SPAdes . . . at {}'.format(thetime))
            print(' '.join(cmd))
            contigs[pairname] = run_SPAdes(cmd)
            if len(self.read_files) > 1:
                # report durations, time left etc
                _report_time(start_time, cnum, len(self.read_files))
    self.paths_to_contigs = contigs
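
# Hedged usage sketch (attribute values are illustrative): SPAdes() reads
# self.read_files, a dict of pairname -> paired fastq paths, and stores the
# resulting contig paths in self.paths_to_contigs.
#
# assembler.read_files = {'sampleA__genome1_100-200+500':
#         ('reads/sampleA_R1.fastq', 'reads/sampleA_R2.fastq')}
# assembler.SPAdes(mem_num_gigs=16, max_cpus=4, careful=True)
# assembler.paths_to_contigs  # {pairname: path to contigs.fasta, or None on failure}
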
def get_download(name, description, source, url, commit, checksum,
                 destination, preparation, checker):
    '''Download and unpack a dependency'''
    initialdir = _os.path.abspath(_os.curdir)
    try:
        _os.chdir(destination)
    except OSError:
        _os.makedirs(destination)
        _os.chdir(destination)
    if checksum:
        hasher_algorithm = checksum.split('=')[0]
        if hasher_algorithm == 'md5':
            hasher = _hashlib.md5()
        elif hasher_algorithm == 'sha1':
            hasher = _hashlib.sha1()
        elif hasher_algorithm == 'sha224':
            hasher = _hashlib.sha224()
        elif hasher_algorithm == 'sha256':
            hasher = _hashlib.sha256()
        elif hasher_algorithm == 'sha384':
            hasher = _hashlib.sha384()
        elif hasher_algorithm == 'sha512':
            hasher = _hashlib.sha512()
        else:
            print("{} checksums not implemented in Python's hashlib!".format(
                    hasher_algorithm))
    print('Downloading: %s' % url)
    req = _urllib2.urlopen(url)
    CHUNK = 16 * 1024 * 16
    data = _cStringIO.StringIO()
    c = 0
    for chunk in iter(lambda: req.read(CHUNK), ''):
        c += CHUNK
        print("{:,} bytes".format(c))
        data.write(chunk)
    print('Download complete . . .')
    data.seek(0)
    if checksum:
        buff = data.read(65536)
        while len(buff) > 0:
            hasher.update(buff)
            buff = data.read(65536)
        e = '. . . checksum fail!'
        assert hasher.hexdigest() == checksum.split('=')[1], e
        print('. . . checksum passed!')
        data.seek(0)
    if url[-6:] == 'tar.gz':
        archive = _tarfile.open(mode="r:gz", fileobj=data)
    elif url[-7:] == 'tar.bz2':
        archive = _tarfile.open(mode="r:bz2", fileobj=data)
    elif url[-4:] == '.zip':
        archive = _zipfile.ZipFile(data)
    if destination == 'local_packages':
        # extract as a pypi python package
        release = url.split('/')[-1][:-7]
        print('Extracting {} to {}'.format(release,
                _os.path.sep.join([destination, name])))
        c = 0
        nostrip = {'pysam'}
        if name in nostrip:
            try:
                _shutil.rmtree(archive.getnames()[0])
            except OSError:
                pass
                #_shutil.rmtree(_os.path.sep.join([destination, archive.getnames()[0]]))
            # some python modules should not be stripped . . more complex install
            for member in archive.getmembers():
                if member.isreg():
                    archive.extract(member)
                    print(member.name)
                    c += 1
        else:
            # others don't need additional compilation
            check_path1 = '{}/{}'.format(release, name)
            for member in archive.getmembers():
                if member.isreg() and check_path1 in member.name:
                    member.name = _os.path.sep.join(
                            member.name.split(_os.path.sep)[1:])
                    archive.extract(member)
                    c += 1
        print('Extracted {} files'.format(c))
    else:
        # extract as a generic external program
        archive.extractall()
    if preparation:
        for do_this in preparation:
            if 'just_packages' in do_this['arguments']:
                # this is the only thing that differentiates this prepare()
                # from others that need some chdir <== this should be improved
                # see dep dict
                curdir = _os.path.abspath(_os.curdir)
                _os.chdir(_os.path.pardir)
                do_this['function'](*do_this['arguments']['package_list'])
                # return to previous folder
                _os.chdir(curdir)
            else:
                extracted_base_dir = archive.getnames()[0].split(_os.path.sep)[0]
                curdir = _os.path.abspath(_os.curdir)
                # go to installed folder
                _os.chdir(_os.path.sep.join([destination, extracted_base_dir]))
                do_this['function'](**do_this['arguments'])
                # return to previous folder
                _os.chdir(curdir)
    _os.chdir(initialdir)
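
# Hedged usage sketch with illustrative values; real URLs and checksums are
# supplied by the dependency definitions used with this function.
#
# get_download(name='exampletool',
#              description='an example tarball dependency',
#              source='download',
#              url='http://example.org/exampletool-1.0.tar.gz',
#              commit=None,               # unused for plain downloads
#              checksum='md5=0123456789abcdef0123456789abcdef',
#              destination='external_programs',
#              preparation=None,
#              checker=None)
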
def align(self, insert_size=False, path_to_exe=False,
          local_alns_path=['alignments'], force=False, max_cpus=-1):
    if not path_to_exe:
        path_to_exe = _get_exe_path('bwa')
    # write genome sequence to a fasta file
    try:
        _os.makedirs('genome_sequences')
    except OSError:
        pass
    genome_fna = 'genome_sequences/%s.fna' % self.genome_id
    _SeqIO.write(_SeqRecord(_Seq(self.genome_sequence.tostring()),
            id=self.genome_id), genome_fna, 'fasta')
    # make folder for alignments (BAMs)
    local_alns_path = _os.path.sep.join(local_alns_path)
    if not _os.path.exists(local_alns_path):
        _os.makedirs(local_alns_path)
    # make a subdir for this genome
    local_alns_path_genome = _os.path.sep.join([
            local_alns_path, self.genome_id])
    if not _os.path.exists(local_alns_path_genome):
        _os.makedirs(local_alns_path_genome)
    max_processes = _decide_max_processes(max_cpus)
    e1 = ('Could not find "read_files" attribute. Before aligning to '
          'genome, reads must be quality score trimmed. Please run trim() '
          'method on this Reads instance.')
    assert hasattr(self, 'read_files'), e1
    e2 = 'Could not find %s. Either run trim() again or ensure file exists'
    for pairname, files in self.read_files.items():
        assert _os.path.exists(files[1]), e2 % files[1]
        assert _os.path.exists(files[2]), e2 % files[2]
    have_index_files = [_os.path.exists(genome_fna + '.' + a)
            for a in ('ann', 'pac', 'amb', 'bwt', 'sa')]
    if not all(have_index_files):
        print('Writing BWA index files for %s' % genome_fna)
        _subprocess.call([path_to_exe, 'index', genome_fna])
    aligned_read_files = {}
    for pairname, files in self.read_files.items():
        RGinfo = r"@RG\tID:%s\tSM:%s\tPL:ILLUMINA" % (pairname, pairname)
        if insert_size:
            # str() in case a numeric insert size was supplied
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-I', str(insert_size), '-R', RGinfo, genome_fna,
                   files[1], files[2]]
        else:
            # BWA can estimate on-the-fly
            cmd = [path_to_exe, 'mem', '-t', str(max_processes), '-M', '-a',
                   '-R', RGinfo, genome_fna, files[1], files[2]]
        out_sam = _os.path.sep.join([local_alns_path_genome,
                '%s__%s.sam' % (pairname, self.genome_id)])
        if not _os.path.exists(out_sam) or force:
            print('Called: "%s"' % ' '.join(cmd))
            with open(out_sam, "wb") as out:
                _subprocess.call(cmd, stdout=out)
        else:
            print('Found:')
            print(out_sam)
            print('use "force = True" to overwrite')
            print(' '.join(cmd))
        aligned_read_files[pairname] = out_sam
    self.aligned_read_files = aligned_read_files
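
# Hedged usage sketch (illustrative attribute values): align() expects
# trimmed reads in self.read_files keyed 1 and 2 per pair, plus genome_id
# and genome_sequence on the same object.
#
# aligner.read_files = {'sampleA': {1: 'reads/sampleA_R1.fastq',
#                                   2: 'reads/sampleA_R2.fastq'}}
# aligner.align(max_cpus=4)      # SAMs written under alignments/<genome_id>/
# aligner.aligned_read_files     # {'sampleA': '.../sampleA__<genome_id>.sam'}
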
def generateReads(self, path_to_exe=False, paths_to_genomes=False,
                  readcov=60, readlen=100, fraglen=350, sterrfraglen=20,
                  model=4, max_cpus=-1):
    '''
    Call GemSIM to generate reads

    Need to have written genome sequences to generate from, possibly with
    generated SNPs, small indels and large deletions.
    '''
    #max_cpus etc
    if paths_to_genomes:
        use_genomes = sorted(paths_to_genomes)
    elif hasattr(self, 'written_genomes'):
        use_genomes = sorted(self.written_genomes)
    else:
        raise ValueError('provide either paths_to_genomes or generate '
                'some then .writeSequences()')
    if not path_to_exe:
        path_to_exe = _get_exe_path('gemsim')
    comment2 = '''
    to generate reads put GemSIM v1.6 into subfolder GemSIM_v1.6 and issue
    these commands:
    GemSIM_v1.6/GemReads.py -r LESB58_for_GemSim_01.fasta -n 1980527 -l d -u 350 -s 20 -m GemSIM_v1.6/models/ill100v4_p.gzip -c -q 33 -p -o GemSimLESB58_01
    '''
    num_pairs = len(self.genome.sequence) * readcov / (readlen * 2)
    if model == 4:
        path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v4_p.gzip'])
    elif model == 5:
        path_to_model = _os.path.sep.join(
                path_to_exe.split(_os.path.sep)[:-1] +
                ['models', 'ill100v5_p.gzip'])
    print('Using error model: {}'.format(path_to_model))
    print('Generating {:,} {}bp read pairs for {}x coverage depth '
          'of a {}bp genome ({})'.format(num_pairs, readlen, readcov,
          len(self.genome.sequence), self.genome.id))
    processes = set()
    max_processes = _decide_max_processes(max_cpus)
    import time
    start = time.time()
    out_raw = []
    for i, genome_in in enumerate(use_genomes):
        # could use per genome length . . less consistent than using reference
        # genome_len = len(_SeqIO.read(genome_in,'fasta').seq)
        # num_pairs = genome_len * readcov / (readlen*2)
        outprefix = 'GemSim_{}_{:02d}'.format(self.genome.id, i + 1)
        cmd = [path_to_exe, '-r', genome_in, '-n', num_pairs, '-l', 'd',
               '-u', fraglen, '-s', sterrfraglen, '-m', path_to_model,
               '-c', '-q', 33, '-p', '-o', outprefix]
        out_raw += [outprefix + '_fir.fastq', outprefix + '_sec.fastq']
        # this would be better to rename and compress all in one
        # maybe as a shell script? Then resuming (--force) would be easier.
        if _os.path.exists(outprefix + '_fir.fastq') and \
                _os.path.exists(outprefix + '_sec.fastq'):
            print('Found output for {}_fir.fastq (and sec), not '
                  'regenerating, delete these to start from '
                  'scratch'.format(outprefix))
        else:
            cmd = map(str, cmd)
            print(' '.join(cmd))
            processes.add(_subprocess.Popen(cmd, shell=False))
        if len(processes) >= max_processes:
            (pid, exit_status) = _os.wait()
            processes.difference_update(
                    [p for p in processes if p.poll() is not None])
    # Check if all the child processes were closed
    for p in processes:
        if p.poll() is None:
            p.wait()
    missing = []
    for o in out_raw:
        if not _os.path.exists(o):
            missing += [o]
    assert len(missing) == 0, 'Could not find:\n{}'.format('\n'.join(missing))
    print('all finished after {} minutes'.format(
            int(round((time.time() - start) / 60.0))))
    outdir = _os.path.sep.join(['simulated_reads', self.genome.id])
    try:
        _os.makedirs(outdir)
    except OSError:
        pass
    for o in out_raw:
        new = _os.path.sep.join([outdir,
                o.replace('fir', 'R1').replace('sec', 'R2')])
        print('{} ==> {}'.format(o, new))
        _os.rename(o, new)
        cmd = ['gzip', new]
        print(' '.join(cmd))
        _subprocess.call(cmd)
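
# Hedged usage sketch (illustrative paths): generateReads() needs GemSIM
# discoverable via _get_exe_path('gemsim'), and either paths_to_genomes or
# genomes previously written to self.written_genomes.
#
# simulator.generateReads(paths_to_genomes=['LESB58_for_GemSim_01.fasta'],
#                         readcov=60, readlen=100,
#                         fraglen=350, sterrfraglen=20,
#                         model=4, max_cpus=4)
# # gzipped fastqs end up under simulated_reads/<genome_id>/
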